Example #1
0
def main(args):
    """Rebuild the grammar around ``args.sentence`` and run CYK on it.

    The sentence is lower-cased (and stored back on ``args``) and split on
    whitespace.  The grammar returned by ``loadGrammar`` loses its ``HS`` and
    ``ES`` productions, gains the rules from ``getTerminalProbability`` plus
    one ``TOKEN -> 'token'`` rule (probability 1) per token, and is then
    written out as text and re-read through ``PCFG.fromstring`` before being
    passed to ``CYK`` together with the tokens and the nonterminals.
    """
    lowered = args.sentence.lower()
    args.sentence = lowered
    tokens = lowered.split()

    grammar = loadGrammar(args)
    nonterm = getnonterm(grammar)
    terminalProductionRules = getTerminalProbability(args, grammar, nonterm)

    # Remove every production whose left-hand side is HS, then ES.
    for symbol in ('HS', 'ES'):
        for rule in grammar.productions(Nonterminal(symbol)):
            grammar.productions().remove(rule)

    grammar.productions().extend(terminalProductionRules)

    # One lexical rule per input token: TOKEN -> 'token' with probability 1.
    for word in tokens:
        lexical_rule = ProbabilisticProduction(
            Nonterminal(word.upper()), [unicode(word)], prob=1)
        grammar.productions().append(lexical_rule)

    #print "Grammars"
    # NOTE(review): grammarlist is never read afterwards.
    grammarlist = str(grammar).split('\n')[1:]

    #print "Transfered"
    # Serialise each production as "LHS -> rhs [prob]", quoting terminals.
    rule_lines = []
    for production in grammar.productions():
        rhs_parts = []
        for symbol in production.rhs():
            if is_terminal(symbol):
                rhs_parts.append('\'' + str(symbol) + '\' ')
            else:
                rhs_parts.append(str(symbol) + ' ')
        rule_lines.append(
            str(production.lhs()) + ' -> ' + ''.join(rhs_parts) +
            ' [' + '{0:.8f}'.format(production.prob()) + ']\n')
    strgrammar = ''.join(rule_lines)
    #print strgrammar

    grammar = PCFG.fromstring(strgrammar.split('\n'))
    #'''
    #grammar = loadGrammar(args)

    #tokens = args.sentence.lower().split()
    #nonterm = getnonterm(grammar)

    CYK(tokens, nonterm, grammar)
    #with open(args.grammar_file, 'r') as f:
    #        content = f.read()

    #trees = corpus2trees(content)
    #productions = trees2productions(trees)
    #listnonterm = []
    #grammar = nltk.grammar.induce_pcfg(nltk.grammar.Nonterminal('SS'), productions)
    #print grammar

    #'''
    '''
Example #2
0
def random_sentences(grammar_string, n=None, depth=5):
    """Yield space-joined sentences generated from a PCFG given as text.

    The grammar is parsed once; each iteration calls ``generate`` with the
    requested ``depth`` and joins the flattened result with spaces.  The
    generator stops once ``n`` sentences have been produced; with the
    default ``n=None`` the counter never matches and it runs indefinitely.
    """
    grammar = PCFG.fromstring(grammar_string)
    produced = 0
    while produced != n:
        tree = generate(grammar, depth=depth)
        yield ' '.join(utils.flatten(tree))
        produced += 1
Example #3
0
def test1(test_str):
  """Draw every item returned by CYK for ``test_str`` under a toy PCFG.

  ``test_str`` is split on single spaces before being handed to CYK.
  """
  grammar_text = """
  S -> A A [0.8] | A B [.1] | A S [.1]
  A -> A A [.6] | A B [.2] | 'a' [.2]
  B -> B A [.3] | A B [.2] | 'b' [.5]
  """
  toy_pcfg = PCFG.fromstring(grammar_text)
  words = test_str.split(" ")
  for parse in CYK(toy_pcfg, words, 5):
    parse.draw()
Example #4
0
def test1(test_str):
    """Run CYK on the space-separated ``test_str`` and draw each result.

    The grammar is a fixed three-nonterminal toy PCFG over 'a' and 'b'.
    """
    rules = """
  S -> A A [0.8] | A B [.1] | A S [.1]
  A -> A A [.6] | A B [.2] | 'a' [.2]
  B -> B A [.3] | A B [.2] | 'b' [.5]
  """
    toy_pcfg = PCFG.fromstring(rules)
    symbols = test_str.split(" ")
    results = CYK(toy_pcfg, symbols, 5)
    for result in results:
        result.draw()
def loadPCFG(input_PCFG_filename):
    '''
    Read a grammar file and build a PCFG from its full text.
    args:input_PCFG_filename(str) path of the file to read
    returns:the object produced by PCFG.fromstring
    '''
    with open(input_PCFG_filename, 'r') as grammar_file:
        grammar_text = grammar_file.read()
    return PCFG.fromstring(grammar_text)
Example #6
0
def langley_2(depth=5, n=500):
    """Build a small relative-clause PCFG and a corpus of its sentences.

    Each sentence from ``generate.generate(G, depth=depth, n=n)`` is printed
    with a 1-based index and appended, followed by ``'. '``, to the corpus.

    Returns the tuple ``(G, C)``: the grammar and the corpus string.
    """
    G = PCFG.fromstring("""
    S -> NP VP [1.0]
    VP -> V NP [1.0]
    NP -> Det N [0.5] | Det N RC [0.5]
    RC -> Rel VP [1.0]
    Det -> 'the' [0.5] | 'a' [0.5]
    V -> 'saw' [0.5] | 'heard' [0.5]
    N -> 'cat' [0.3333] | 'dog' [0.3333] | 'mouse' [0.3333]
    Rel -> 'that' [1.0]
    """)
    corpus_parts = []
    ## all possible sentences
    print("\n")
    sentences = generate.generate(G, depth=depth, n=n)
    for index, sent in enumerate(sentences, 1):
        s = ' '.join(sent)
        corpus_parts.append(s + '. ')
        print('%3d. %s%s' % (index, s, '.'))
    C = ''.join(corpus_parts)  # corpus
    return G, C
# beta_20 is beta / 20, rounded to four decimal places.
beta_20 = round(beta / 20, 4)
print("beta_20: " + str(beta_20))
# beta_20 and compl_beta_20 must always add up to 0.05.
compl_beta_20 = 0.05 - beta_20
print("compl_beta_20: " + str(compl_beta_20))

# Only the last two alternatives of the S rule depend on beta; every other
# rule of the grammar is fixed text.
s_rule = (
    "    S -> A V1 B [0.5] | C W1 D [0.45] | C W2 D [" + str(beta_20)
    + "] | A W2 B [" + str(compl_beta_20) + "]"
)
lexical_rules = """
    A -> 'a0' [0.1] | 'a1' [0.1] | 'a2' [0.1] | 'a3' [0.1] | 'a4' [0.1] | 'a5' [0.1] | 'a6' [0.1] | 'a7' [0.1] | 'a8' [0.1] | 'a9' [0.1]
    B -> 'b0' [0.1] | 'b1' [0.1] | 'b2' [0.1] | 'b3' [0.1] | 'b4' [0.1] | 'b5' [0.1] | 'b6' [0.1] | 'b7' [0.1] | 'b8' [0.1] | 'b9' [0.1]
    C -> 'c0' [0.1] | 'c1' [0.1] | 'c2' [0.1] | 'c3' [0.1] | 'c4' [0.1] | 'c5' [0.1] | 'c6' [0.1] | 'c7' [0.1] | 'c8' [0.1] | 'c9' [0.1]
    D -> 'd0' [0.1] | 'd1' [0.1] | 'd2' [0.1] | 'd3' [0.1] | 'd4' [0.1] | 'd5' [0.1] | 'd6' [0.1] | 'd7' [0.1] | 'd8' [0.1] | 'd9' [0.1]    
    V1 -> 'v0' [0.02] | 'v1' [0.02] | 'v2' [0.02] | 'v3' [0.02] | 'v4' [0.02] | 'v5' [0.02] | 'v6' [0.02] | 'v7' [0.02] | 'v8' [0.02] | 'v9' [0.02] | 'v10' [0.02] | 'v11' [0.02] | 'v12' [0.02] | 'v13' [0.02] | 'v14' [0.02] | 'v15' [0.02] | 'v16' [0.02] | 'v17' [0.02] | 'v18' [0.02] | 'v19' [0.02] | 'v20' [0.02] | 'v21' [0.02] | 'v22' [0.02] | 'v23' [0.02] | 'v24' [0.02] | 'v25' [0.02] | 'v26' [0.02] | 'v27' [0.02] | 'v28' [0.02] | 'v29' [0.02] | 'v30' [0.02] | 'v31' [0.02] | 'v32' [0.02] | 'v33' [0.02] | 'v34' [0.02] | 'v35' [0.02] | 'v36' [0.02] | 'v37' [0.02] | 'v38' [0.02] | 'v39' [0.02] | 'v40' [0.02] | 'v41' [0.02] | 'v42' [0.02] | 'v43' [0.02] | 'v44' [0.02] | 'v45' [0.02] | 'v46' [0.02] | 'v47' [0.02] | 'v48' [0.02] | 'v49' [0.02]
    W1 ->'w5' [0.0232] | 'w6' [0.0222] | 'w7' [0.0222] | 'w8' [0.0222] | 'w9' [0.0222] | 'w10' [0.0222] | 'w11' [0.0222] | 'w12' [0.0222] | 'w13' [0.0222] | 'w14' [0.0222] | 'w15' [0.0222] | 'w16' [0.0222] | 'w17' [0.0222] | 'w18' [0.0222] | 'w19' [0.0222] | 'w20' [0.0222] | 'w21' [0.0222] | 'w22' [0.0222] | 'w23' [0.0222] | 'w24' [0.0222] | 'w25' [0.0222] | 'w26' [0.0222] | 'w27' [0.0222] | 'w28' [0.0222] | 'w29' [0.0222] | 'w30' [0.0222] | 'w31' [0.0222] | 'w32' [0.0222] | 'w33' [0.0222] | 'w34' [0.0222] | 'w35' [0.0222] | 'w36' [0.0222] | 'w37' [0.0222] | 'w38' [0.0222] | 'w39' [0.0222] | 'w40' [0.0222] | 'w41' [0.0222] | 'w42' [0.0222] | 'w43' [0.0222] | 'w44' [0.0222] | 'w45' [0.0222] | 'w46' [0.0222] | 'w47' [0.0222] | 'w48' [0.0222] | 'w49' [0.0222] 
    W2 -> 'w0' [0.2] | 'w1' [0.2] | 'w2' [0.2] | 'w3' [0.2] | 'w4' [0.2] 

"""
grammar = "\n" + s_rule + lexical_rules
print(grammar)

ambiguity = PCFG.fromstring(grammar)
print(ambiguity)


# Write one generated sentence per line to a file named after alfa.
archivo_destino = 'ambiguity_alfa_' + str(alfa) + '.txt'
with open(archivo_destino, 'w') as f:
    for sentence in pcfg_generate(ambiguity, n=100000, depth=100):
        line = ' '.join(sentence)
        f.write(line + '\n')



Example #8
0
    def test_parse_2(self):
        """Check the CKY chart and back-pointers on a PP-attachment sentence.

        The expected tables hold an entry for every span ``(i, j)`` of the
        eight-word sentence; spans not listed explicitly are expected empty.
        """
        grammar = PCFG.fromstring(
            """
                S -> NP VP              [1.0]
                NP -> NP PP             [0.5]
                NP -> Det Noun          [0.5]
                VP -> VP PP             [0.9]
                VP -> Verb NP           [0.1]
                PP -> Prep NP           [1.0]
                Noun -> 'dog'           [0.2]
                Noun -> 'man'           [0.2]
                Noun -> 'town'          [0.6]
                Verb -> 'saw'           [1.0]
                Prep -> 'in'            [1.0]
                Det -> 'the'            [1.0]
            """)

        words = 'the man saw the dog in the town'.split()
        parser = CKYParser(grammar)
        lp, t = parser.parse(words)

        # All spans (i, j) with 1 <= i <= j <= n, shortest spans first.
        n = len(words)
        spans = [(start, start + width)
                 for width in range(n)
                 for start in range(1, n - width + 1)]

        # check chart: start from empty cells, then fill the expected ones
        pi = {span: {} for span in spans}
        pi.update({
            (1, 1): {'Det': log2(1.0)},
            (2, 2): {'Noun': log2(0.2)},
            (3, 3): {'Verb': log2(1.0)},
            (4, 4): {'Det': log2(1.0)},
            (5, 5): {'Noun': log2(0.2)},
            (6, 6): {'Prep': log2(1.0)},
            (7, 7): {'Det': log2(1.0)},
            (8, 8): {'Noun': log2(0.6)},
            (1, 2): {'NP': -3.321928094887362},
            (4, 5): {'NP': -3.321928094887362},
            (7, 8): {'NP': -1.736965594166206},
            (3, 5): {'VP': -6.643856189774724},
            (6, 8): {'PP': -1.736965594166206},
            (1, 5): {'S': -9.965784284662087},
            (4, 8): {'NP': -6.058893689053567},
            (3, 8): {'VP': -8.53282487738598},
            (1, 8): {'S': -11.854752972273342},
        })
        self.assertEqualPi(parser._pi, pi)

        # check back-pointers: same layout, trees instead of log-probs
        bp = {span: {} for span in spans}
        bp.update({
            (1, 1): {'Det': Tree.fromstring('(Det the)')},
            (2, 2): {'Noun': Tree.fromstring('(Noun man)')},
            (3, 3): {'Verb': Tree.fromstring('(Verb saw)')},
            (4, 4): {'Det': Tree.fromstring('(Det the)')},
            (5, 5): {'Noun': Tree.fromstring('(Noun dog)')},
            (6, 6): {'Prep': Tree.fromstring('(Prep in)')},
            (7, 7): {'Det': Tree.fromstring('(Det the)')},
            (8, 8): {'Noun': Tree.fromstring('(Noun town)')},
            (1, 2): {'NP': Tree.fromstring('(NP (Det the) (Noun man))')},
            (4, 5): {'NP': Tree.fromstring('(NP (Det the) (Noun dog))')},
            (7, 8): {'NP': Tree.fromstring('(NP (Det the) (Noun town))')},
            (3, 5): {'VP': Tree.fromstring(
                '(VP (Verb saw) (NP (Det the) (Noun dog)))')},
            (6, 8): {'PP': Tree.fromstring(
                '(PP (Prep in) (NP (Det the) (Noun town)))')},
            (1, 5): {'S': Tree.fromstring(
                """(S
                      (NP (Det the) (Noun man))
                      (VP (Verb saw) (NP (Det the) (Noun dog))))""")},
            (4, 8): {'NP': Tree.fromstring(
                """(NP
                      (NP (Det the) (Noun dog))
                      (PP (Prep in) (NP (Det the) (Noun town))))""")},
            (3, 8): {'VP': Tree.fromstring(
                """(VP
                      (VP (Verb saw) (NP (Det the) (Noun dog)))
                      (PP (Prep in) (NP (Det the) (Noun town))))""")},
            (1, 8): {'S': Tree.fromstring(
                """(S
                      (NP (Det the) (Noun man))
                      (VP
                        (VP (Verb saw) (NP (Det the) (Noun dog)))
                        (PP (Prep in) (NP (Det the) (Noun town)))))""")},
        })

        self.assertEqual(parser._bp, bp)
Example #9
0
    def test_parse_2(self):
        """Verify chart scores and back-pointer trees for an ambiguous parse.

        Both expected tables cover every span ``(i, j)`` of the sentence;
        any span that is not filled in below is expected to be empty.
        """
        grammar = PCFG.fromstring("""
                S -> NP VP              [1.0]
                NP -> NP PP             [0.5]
                NP -> Det Noun          [0.5]
                VP -> VP PP             [0.9]
                VP -> Verb NP           [0.1]
                PP -> Prep NP           [1.0]
                Noun -> 'dog'           [0.2]
                Noun -> 'man'           [0.2]
                Noun -> 'town'          [0.6]
                Verb -> 'saw'           [1.0]
                Prep -> 'in'            [1.0]
                Det -> 'the'            [1.0]
            """)

        tokens = 'the man saw the dog in the town'.split()
        parser = CKYParser(grammar)
        lp, t = parser.parse(tokens)

        # Enumerate spans by increasing length: (1,1)..(8,8), (1,2)..(7,8), ...
        size = len(tokens)
        all_spans = [(left, left + extra)
                     for extra in range(size)
                     for left in range(1, size - extra + 1)]

        # check chart
        pi = {span: {} for span in all_spans}
        filled_scores = {
            (1, 1): {'Det': log2(1.0)},
            (2, 2): {'Noun': log2(0.2)},
            (3, 3): {'Verb': log2(1.0)},
            (4, 4): {'Det': log2(1.0)},
            (5, 5): {'Noun': log2(0.2)},
            (6, 6): {'Prep': log2(1.0)},
            (7, 7): {'Det': log2(1.0)},
            (8, 8): {'Noun': log2(0.6)},
            (1, 2): {'NP': -3.321928094887362},
            (4, 5): {'NP': -3.321928094887362},
            (7, 8): {'NP': -1.736965594166206},
            (3, 5): {'VP': -6.643856189774724},
            (6, 8): {'PP': -1.736965594166206},
            (1, 5): {'S': -9.965784284662087},
            (4, 8): {'NP': -6.058893689053567},
            (3, 8): {'VP': -8.53282487738598},
            (1, 8): {'S': -11.854752972273342},
        }
        pi.update(filled_scores)
        self.assertEqualPi(parser._pi, pi)

        bp = {span: {} for span in all_spans}
        filled_trees = {
            (1, 1): {'Det': Tree.fromstring('(Det the)')},
            (2, 2): {'Noun': Tree.fromstring('(Noun man)')},
            (3, 3): {'Verb': Tree.fromstring('(Verb saw)')},
            (4, 4): {'Det': Tree.fromstring('(Det the)')},
            (5, 5): {'Noun': Tree.fromstring('(Noun dog)')},
            (6, 6): {'Prep': Tree.fromstring('(Prep in)')},
            (7, 7): {'Det': Tree.fromstring('(Det the)')},
            (8, 8): {'Noun': Tree.fromstring('(Noun town)')},
            (1, 2): {'NP': Tree.fromstring('(NP (Det the) (Noun man))')},
            (4, 5): {'NP': Tree.fromstring('(NP (Det the) (Noun dog))')},
            (7, 8): {'NP': Tree.fromstring('(NP (Det the) (Noun town))')},
            (3, 5): {'VP': Tree.fromstring(
                '(VP (Verb saw) (NP (Det the) (Noun dog)))')},
            (6, 8): {'PP': Tree.fromstring(
                '(PP (Prep in) (NP (Det the) (Noun town)))')},
            (1, 5): {'S': Tree.fromstring("""(S
                      (NP (Det the) (Noun man))
                      (VP (Verb saw) (NP (Det the) (Noun dog))))""")},
            (4, 8): {'NP': Tree.fromstring("""(NP
                      (NP (Det the) (Noun dog))
                      (PP (Prep in) (NP (Det the) (Noun town))))""")},
            (3, 8): {'VP': Tree.fromstring("""(VP
                      (VP (Verb saw) (NP (Det the) (Noun dog)))
                      (PP (Prep in) (NP (Det the) (Noun town))))""")},
            (1, 8): {'S': Tree.fromstring("""(S
                      (NP (Det the) (Noun man))
                      (VP
                        (VP (Verb saw) (NP (Det the) (Noun dog)))
                        (PP (Prep in) (NP (Det the) (Noun town)))))""")},
        }
        bp.update(filled_trees)

        self.assertEqual(parser._bp, bp)
Example #10
0
    def test_parse(self):
        """Check chart, back-pointers, tree and log-probability of one parse.

        The expected tables cover every span ``(i, j)`` of the five-word
        sentence; spans that are not filled in below are expected empty.
        """
        grammar = PCFG.fromstring("""
                S -> NP VP              [1.0]
                NP -> Det Noun          [0.6]
                NP -> Noun Adj          [0.4]
                VP -> Verb NP           [1.0]
                Det -> 'el'             [1.0]
                Noun -> 'gato'          [0.9]
                Noun -> 'pescado'       [0.1]
                Verb -> 'come'          [1.0]
                Adj -> 'crudo'          [1.0]
            """)

        words = 'el gato come pescado crudo'.split()
        parser = CKYParser(grammar)
        lp, t = parser.parse(words)

        # All spans (i, j) with 1 <= i <= j <= n, shortest spans first.
        n = len(words)
        spans = [(start, start + width)
                 for width in range(n)
                 for start in range(1, n - width + 1)]

        # check chart
        pi = {span: {} for span in spans}
        pi.update({
            (1, 1): {'Det': log2(1.0)},
            (2, 2): {'Noun': log2(0.9)},
            (3, 3): {'Verb': log2(1.0)},
            (4, 4): {'Noun': log2(0.1)},
            (5, 5): {'Adj': log2(1.0)},
            (1, 2): {'NP': log2(0.6 * 1.0 * 0.9)},
            (4, 5): {'NP': log2(0.4 * 0.1 * 1.0)},
            (3, 5): {'VP': log2(1.0) + log2(1.0) + log2(0.4 * 0.1 * 1.0)},
            (1, 5): {'S':
                     log2(1.0) +  # rule S -> NP VP
                     log2(0.6 * 1.0 * 0.9) +  # left part
                     log2(1.0) + log2(1.0) + log2(0.4 * 0.1 * 1.0)},  # right part
        })
        self.assertEqualPi(parser._pi, pi)

        # check partial results
        bp = {span: {} for span in spans}
        bp.update({
            (1, 1): {'Det': Tree.fromstring("(Det el)")},
            (2, 2): {'Noun': Tree.fromstring("(Noun gato)")},
            (3, 3): {'Verb': Tree.fromstring("(Verb come)")},
            (4, 4): {'Noun': Tree.fromstring("(Noun pescado)")},
            (5, 5): {'Adj': Tree.fromstring("(Adj crudo)")},
            (1, 2): {'NP': Tree.fromstring("(NP (Det el) (Noun gato))")},
            (4, 5): {'NP': Tree.fromstring("(NP (Noun pescado) (Adj crudo))")},
            (3, 5): {'VP': Tree.fromstring(
                "(VP (Verb come) (NP (Noun pescado) (Adj crudo)))")},
            (1, 5): {'S': Tree.fromstring("""(S
                    (NP (Det el) (Noun gato))
                    (VP (Verb come) (NP (Noun pescado) (Adj crudo)))
                   )
                """)},
        })
        self.assertEqual(parser._bp, bp)

        # check tree
        expected_tree = Tree.fromstring("""
                (S
                    (NP (Det el) (Noun gato))
                    (VP (Verb come) (NP (Noun pescado) (Adj crudo)))
                )
            """)
        self.assertEqual(t, expected_tree)

        # check log probability
        expected_lp = log2(1.0 * 0.6 * 1.0 * 0.9 * 1.0 * 1.0 * 0.4 * 0.1 * 1.0)
        self.assertAlmostEqual(lp, expected_lp)
from nltk.grammar import Nonterminal, PCFG
from pcfg_generate import *


# Grammar with two sentence frames, "A V B" and "C W D", each slot filled by
# one of ten equally likely terminals.
sparseness_rules = """
    S -> A V B [0.5] | C W D [0.5] 
    A -> 'a0' [0.1] | 'a1' [0.1] | 'a2' [0.1] | 'a3' [0.1] | 'a4' [0.1] | 'a5' [0.1] | 'a6' [0.1] | 'a7' [0.1] | 'a8' [0.1] | 'a9' [0.1]
    B -> 'b0' [0.1] | 'b1' [0.1] | 'b2' [0.1] | 'b3' [0.1] | 'b4' [0.1] | 'b5' [0.1] | 'b6' [0.1] | 'b7' [0.1] | 'b8' [0.1] | 'b9' [0.1]
    C -> 'c0' [0.1] | 'c1' [0.1] | 'c2' [0.1] | 'c3' [0.1] | 'c4' [0.1] | 'c5' [0.1] | 'c6' [0.1] | 'c7' [0.1] | 'c8' [0.1] | 'c9' [0.1]
    D -> 'd0' [0.1] | 'd1' [0.1] | 'd2' [0.1] | 'd3' [0.1] | 'd4' [0.1] | 'd5' [0.1] | 'd6' [0.1] | 'd7' [0.1] | 'd8' [0.1] | 'd9' [0.1]
    V -> 'v0' [0.1] | 'v1' [0.1] | 'v2' [0.1] | 'v3' [0.1] | 'v4' [0.1] | 'v5' [0.1] | 'v6' [0.1] | 'v7' [0.1] | 'v8' [0.1] | 'v9' [0.1]    
    W -> 'w0' [0.1] | 'w1' [0.1] | 'w2' [0.1] | 'w3' [0.1] | 'w4' [0.1] | 'w5' [0.1] | 'w6' [0.1] | 'w7' [0.1] | 'w8' [0.1] | 'w9' [0.1]    
"""
sparseness = PCFG.fromstring(sparseness_rules)
print(sparseness)

# Generated corpus: one space-joined sentence per line.
corpus_lines = []
for sentence in pcfg_generate(sparseness, n=100000, depth=6):
    str1 = ' '.join(sentence)
    corpus_lines.append(str1 + '\n')
texto = ''.join(corpus_lines)

# Ten extra lines per frame whose middle token (u<i> / x<i>) is not a
# terminal of the grammar above.
ab_lines = []
cd_lines = []
for i in range(10):
    ab_lines.append('a' + str(i) + ' ' + 'u' + str(i) + ' ' + 'b' + str(i) + ' ' + '\n')
    cd_lines.append('c' + str(i) + ' ' + 'x' + str(i) + ' ' + 'd' + str(i) + ' ' + '\n')
context_ab = ''.join(ab_lines)
context_cd = ''.join(cd_lines)

texto = texto + context_ab + context_cd
Example #12
0
def demo(choice=None, draw_parses=None, print_parses=None):
    """
    A demonstration of the probabilistic parsers.  One of two built-in
    (sentence, grammar) demos is selected, every parser is run on that
    sentence, and a summary of the results is displayed.

    :param choice: 0-based index into the built-in demos.  When None the
        user is prompted on stdin and enters a 1-based number.
    :param draw_parses: whether to draw the collected parses.  When None
        the user is asked on stdin.
    :param print_parses: whether to print the collected parses.  When None
        the user is asked on stdin.
    :return: None.  Prints 'Bad sentence number' and returns early when
        ``choice`` cannot be used to index the demo list.
    """
    import sys
    import time
    from nltk import tokenize
    from nltk.parse import pchart

    # Define two demos.  Each demo has a sentence and a grammar.
    toy_pcfg1 = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
    Det -> 'the' [0.8] | 'my' [0.2]
    N -> 'man' [0.5] | 'telescope' [0.5]
    VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
    V -> 'ate' [0.35] | 'saw' [0.65]
    PP -> P NP [1.0]
    P -> 'with' [0.61] | 'under' [0.39]
    """)

    toy_pcfg2 = PCFG.fromstring("""
    S    -> NP VP         [1.0]
    VP   -> V NP          [.59]
    VP   -> V             [.40]
    VP   -> VP PP         [.01]
    NP   -> Det N         [.41]
    NP   -> Name          [.28]
    NP   -> NP PP         [.31]
    PP   -> P NP          [1.0]
    V    -> 'saw'         [.21]
    V    -> 'ate'         [.51]
    V    -> 'ran'         [.28]
    N    -> 'boy'         [.11]
    N    -> 'cookie'      [.12]
    N    -> 'table'       [.13]
    N    -> 'telescope'   [.14]
    N    -> 'hill'        [.5]
    Name -> 'Jack'        [.52]
    Name -> 'Bob'         [.48]
    P    -> 'with'        [.61]
    P    -> 'under'       [.39]
    Det  -> 'the'         [.41]
    Det  -> 'a'           [.31]
    Det  -> 'my'          [.28]
    """)

    demos = [('I saw John with my telescope', toy_pcfg1),
             ('the boy saw Jack with Bob under the table with a telescope',
              toy_pcfg2)]

    if choice is None:
        # Ask the user which demo they want to use.
        print()
        for number, (demo_sent, demo_grammar) in enumerate(demos, 1):
            print('%3s: %s' % (number, demo_sent))
            print('     %r' % demo_grammar)
            print()
        print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
        choice = int(sys.stdin.readline().strip())-1
    try:
        sent, grammar = demos[choice]
    except (IndexError, TypeError):
        # Out-of-range or non-integer choice.  Deliberately narrow so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        print('Bad sentence number')
        return

    # Tokenize the sentence.
    tokens = sent.split()

    # Define a list of parsers.  We'll use all parsers.
    parsers = [
        pchart.InsideChartParser(grammar),
        pchart.RandomChartParser(grammar),
        pchart.UnsortedChartParser(grammar),
        pchart.LongestChartParser(grammar),
        pchart.InsideChartParser(grammar, beam_size = len(tokens)+1)   # was BeamParser
        ]

    # Run the parsers on the tokenized sentence.
    times = []
    average_p = []
    num_parses = []
    all_parses = {}
    for parser in parsers:
        print('\ns: %s\nparser: %s\ngrammar: %s' % (sent,parser,grammar))
        parser.trace(3)
        started = time.time()
        parses = list(parser.parse(tokens))
        times.append(time.time()-started)
        # sum() instead of reduce(): reduce is not a builtin on Python 3.
        if parses:
            mean_prob = sum(parse.prob() for parse in parses)/len(parses)
        else:
            mean_prob = 0
        average_p.append(mean_prob)
        num_parses.append(len(parses))
        # Frozen parses are used as dict keys to collect the distinct ones.
        for parse in parses:
            all_parses[parse.freeze()] = 1

    # Print some summary statistics
    print()
    print('       Parser      Beam | Time (secs)   # Parses   Average P(parse)')
    print('------------------------+------------------------------------------')
    for parser, elapsed, count, mean_prob in zip(parsers, times,
                                                 num_parses, average_p):
        print('%18s %4d |%11.4f%11d%19.14f' % (parser.__class__.__name__,
                                             parser.beam_size,
                                             elapsed, count, mean_prob))
    parses = list(all_parses)
    if parses:
        p = sum(parse.prob() for parse in parses)/len(parses)
    else:
        p = 0
    print('------------------------+------------------------------------------')
    print('%18s      |%11s%11d%19.14f' % ('(All Parses)', 'n/a', len(parses), p))

    if draw_parses is None:
        # Ask the user if we should draw the parses.
        print()
        print('Draw parses (y/n)? ', end=' ')
        draw_parses = sys.stdin.readline().strip().lower().startswith('y')
    if draw_parses:
        from nltk.draw.tree import draw_trees
        print('  please wait...')
        draw_trees(*parses)

    if print_parses is None:
        # Ask the user if we should print the parses.
        print()
        print('Print parses (y/n)? ', end=' ')
        print_parses = sys.stdin.readline().strip().lower().startswith('y')
    if print_parses:
        for parse in parses:
            print(parse)
Example #13
0
    def test_ambiguo(self):
        """Check the chart and the returned tree for an ambiguous sentence.

        The expected chart covers every span ``(i, j)`` of the eight-word
        sentence; spans that are not filled in below are expected empty.
        """
        grammar = PCFG.fromstring("""
                S -> NP VP              [1.0]
                VP -> Vt NP             [0.3]
                VP -> VP PP             [0.7]
                NP -> NP PP             [0.6]
                NP -> DT NN             [0.4]
                PP -> IN NP             [1.0]
                Vt -> 'saw'             [1.0]
                NN -> 'man'             [0.33]
                NN -> 'telescope'       [0.33]
                NN -> 'dog'             [0.34]
                DT -> 'the'             [1.0]
                IN -> 'with'            [1.0]
            """)

        words = 'the man saw the dog with the telescope'.split()
        parser = CKYParser(grammar)
        lp, t = parser.parse(words)

        # All spans (i, j) with 1 <= i <= j <= n, shortest spans first.
        n = len(words)
        spans = [(start, start + width)
                 for width in range(n)
                 for start in range(1, n - width + 1)]

        pi = {span: {} for span in spans}
        pi.update({
            (1, 1): {'DT': 0.0},
            (2, 2): {'NN': -1.5994620704162712},
            (3, 3): {'Vt': 0.0},
            (4, 4): {'DT': 0.0},
            (5, 5): {'NN': -1.5563933485243853},
            (6, 6): {'IN': 0.0},
            (7, 7): {'DT': 0.0},
            (8, 8): {'NN': -1.5994620704162712},
            (1, 2): {'NP': -2.9213901653036336},
            (4, 5): {'NP': -2.8783214434117474},
            (7, 8): {'NP': -2.9213901653036336},
            (3, 5): {'VP': -4.6152870375779536},
            (6, 8): {'PP': -2.9213901653036336},
            (1, 5): {'S': -7.536677202881587},
            (4, 8): {'NP': -6.536677202881587},
            (3, 8): {'VP': -8.051250375711346},
            (1, 8): {'S': -10.972640541014979},
        })

        self.assertEqualPi(parser._pi, pi)

        # Expected tree attaches the PP to the verb phrase.
        expected_tree = Tree.fromstring("""
            (S
              (NP (DT the) (NN man))
              (VP
                (VP (Vt saw) (NP (DT the) (NN dog)))
                (PP (IN with) (NP (DT the) (NN telescope)))))
            """)

        self.assertEqual(t, expected_tree)
Example #14
0
# PCFG over single-character tokens (digits, 'x', '.', '+', '-', '*', '/',
# '(', ')', '=') in which S expands to a side followed by '=' and another
# side.  Every rule has at most two symbols on its right-hand side.
equation_rules = """
    S -> negside eqside [0.5]
    S -> side eqside [0.5]
    digit -> '0' [0.1]
    digit -> '1' [0.1]
    digit -> '2' [0.1]
    digit -> '3' [0.1]
    digit -> '4' [0.1]
    digit -> '5' [0.1]
    digit -> '6' [0.1]
    digit -> '7' [0.1]
    digit -> '8' [0.1]
    digit -> '9' [0.1]
    div -> '/' [1.0]
    divnum -> div number [1.0]
    divnumvar -> divnum variable [1.0]
    dot -> '.' [1.0]
    eq -> '=' [1.0]
    eqside -> eq negside [0.5]
    eqside -> eq side [0.5]
    lparen -> '(' [1.0]
    minus -> '-' [1.0]
    minusterm -> minus term [1.0]
    negside -> minus side [1.0]
    number -> '0' [.05]
    number -> '1' [.05]
    number -> '2' [.05]
    number -> '3' [.05]
    number -> '4' [.05]
    number -> '5' [.05]
    number -> '6' [.05]
    number -> '7' [.05]
    number -> '8' [.05]
    number -> '9' [.05]
    number -> digit number [.05]
    number -> dot number [0.45]
    parenside -> lparen siderparen [1.0]
    plus -> '+' [1.0]
    plusterm -> plus term [1.0]
    rparen -> ')' [1.0]
    side -> '0' [.04]
    side -> '1' [.04]
    side -> '2' [.04]
    side -> '3' [.04]
    side -> '4' [.04]
    side -> '5' [.04]
    side -> '6' [.04]
    side -> '7' [.04]
    side -> '8' [.04]
    side -> '9' [.04]
    side -> 'x' [.04]
    side -> digit number [.04]
    side -> dot number [.04]
    side -> number divnum [.04]
    side -> number divnumvar [.04]
    side -> number parenside [.04]
    side -> number starnum [.04]
    side -> number vardivnum [.04]
    side -> number variable [.04]
    side -> parenside divnum [.04]
    side -> side minusterm [.04]
    side -> side plusterm [.16]
    siderparen -> negside rparen [0.5]
    siderparen -> side rparen [0.5]
    star -> '*' [1.0]
    starnum -> star number [1.0]
    term -> '0' [.05]
    term -> '1' [.05]
    term -> '2' [.05]
    term -> '3' [.05]
    term -> '4' [.05]
    term -> '5' [.05]
    term -> '6' [.05]
    term -> '7' [.05]
    term -> '8' [.05]
    term -> '9' [.05]
    term -> 'x' [.05]
    term -> digit number [.05]
    term -> dot number [.05]
    term -> number divnum [.05]
    term -> number divnumvar [.05]
    term -> number parenside [.05]
    term -> number starnum [.05]
    term -> number vardivnum [.05]
    term -> number variable [.05]
    term -> parenside divnum [.05]
    vardivnum -> variable divnum [1.0]
    variable -> 'x' [1.0]
    varstarnum -> variable starnum [1.0]
"""
grammar = PCFG.fromstring(equation_rules)
import nltk
from nltk.corpus import treebank
from itertools import islice
from nltk.grammar import PCFG, induce_pcfg, toy_pcfg1, toy_pcfg2
# Tiny hand-written PCFG used to poke at the production API: alternatives
# for one left-hand side are separated by '|' and each carries its own
# bracketed probability.
gram2 = PCFG.fromstring("""
	A -> B B [.3] | C B C [.7]
	B -> B D [.5] | C [.5]
	C -> 'a' [.1] | 'b' [0.9]
	D -> 'b' [1.0]
	""")

# Show the first two productions, then take the second one apart into its
# left-hand side, right-hand side and probability.
prod1, prod2 = gram2.productions()[:2]
print(prod1, prod2, sep="\n")
print(prod2.lhs(), prod2.rhs(), prod2.prob(), sep="\n")

# Finally show the grammar's start symbol and its full production list.
print(gram2.start())
print(gram2.productions())
Example #16
0
    def test_parse(self):
        """Parse an unambiguous five-word sentence and verify the parser's
        chart, back-pointers, returned tree and returned log probability."""
        grammar = PCFG.fromstring(
            """
                S -> NP VP              [1.0]
                NP -> Det Noun          [0.6]
                NP -> Noun Adj          [0.4]
                VP -> Verb NP           [1.0]
                Det -> 'el'             [1.0]
                Noun -> 'gato'          [0.9]
                Noun -> 'pescado'       [0.1]
                Verb -> 'come'          [1.0]
                Adj -> 'crudo'          [1.0]
            """)
        words = 'el gato come pescado crudo'.split()

        parser = CKYParser(grammar)
        lp, t = parser.parse(words)

        # Log-probabilities of the multi-word constituents, built bottom-up.
        lp_np_subject = log2(0.6 * 1.0 * 0.9)        # NP -> Det Noun
        lp_np_object = log2(0.4 * 0.1 * 1.0)         # NP -> Noun Adj
        lp_vp = log2(1.0) + log2(1.0) + lp_np_object  # VP -> Verb NP
        lp_s = log2(1.0) + lp_np_subject + lp_vp     # S -> NP VP

        # Expected chart: keys are (first, last) word positions, counted
        # from 1; spans that cover no constituent map to an empty dict.
        expected_pi = {
            # spans of length 1
            (1, 1): {'Det': log2(1.0)},
            (2, 2): {'Noun': log2(0.9)},
            (3, 3): {'Verb': log2(1.0)},
            (4, 4): {'Noun': log2(0.1)},
            (5, 5): {'Adj': log2(1.0)},
            # spans of length 2
            (1, 2): {'NP': lp_np_subject},
            (2, 3): {},
            (3, 4): {},
            (4, 5): {'NP': lp_np_object},
            # spans of length 3
            (1, 3): {},
            (2, 4): {},
            (3, 5): {'VP': lp_vp},
            # spans of length 4
            (1, 4): {},
            (2, 5): {},
            # the whole sentence
            (1, 5): {'S': lp_s},
        }
        self.assertEqualPi(parser._pi, expected_pi)

        # Expected back-pointers: same keys as the chart, holding the best
        # subtree found for each nonterminal over that span.
        tree_np_subject = Tree.fromstring("(NP (Det el) (Noun gato))")
        tree_np_object = Tree.fromstring("(NP (Noun pescado) (Adj crudo))")
        tree_vp = Tree.fromstring(
                "(VP (Verb come) (NP (Noun pescado) (Adj crudo)))")
        tree_s = Tree.fromstring(
                """(S
                    (NP (Det el) (Noun gato))
                    (VP (Verb come) (NP (Noun pescado) (Adj crudo)))
                   )
                """)
        expected_bp = {
            (1, 1): {'Det': Tree.fromstring("(Det el)")},
            (2, 2): {'Noun': Tree.fromstring("(Noun gato)")},
            (3, 3): {'Verb': Tree.fromstring("(Verb come)")},
            (4, 4): {'Noun': Tree.fromstring("(Noun pescado)")},
            (5, 5): {'Adj': Tree.fromstring("(Adj crudo)")},

            (1, 2): {'NP': tree_np_subject},
            (2, 3): {},
            (3, 4): {},
            (4, 5): {'NP': tree_np_object},

            (1, 3): {},
            (2, 4): {},
            (3, 5): {'VP': tree_vp},

            (1, 4): {},
            (2, 5): {},

            (1, 5): {'S': tree_s},
        }
        self.assertEqual(parser._bp, expected_bp)

        # The returned tree must be the full-sentence parse.
        expected_tree = Tree.fromstring(
            """
                (S
                    (NP (Det el) (Noun gato))
                    (VP (Verb come) (NP (Noun pescado) (Adj crudo)))
                )
            """)
        self.assertEqual(t, expected_tree)

        # The returned log probability is the log of the product of every
        # rule probability used in that tree.
        expected_lp = log2(1.0 * 0.6 * 1.0 * 0.9 * 1.0 * 1.0 * 0.4 * 0.1 * 1.0)
        self.assertAlmostEqual(lp, expected_lp)
from nltk.grammar import Nonterminal, PCFG
from pcfg_generate import *


# Three-token sentences: a 'v*' word only appears between two different end
# symbols (a..b or b..a), while a 'w*' word appears between any pair of end
# symbols.  All five words of each class are equally likely.
grammar = """
    S -> 'a' V 'b' [0.25] | 'b' V 'a' [0.25] | 'a' W 'a' [0.125] | 'a' W 'b' [0.125] | 'b' W 'a' [0.125] | 'b' W 'b' [0.125]
    V -> 'v0' [0.2] | 'v1' [0.2] | 'v2' [0.2] | 'v3' [0.2] | 'v4' [0.2]
    W -> 'w0' [0.2] | 'w1' [0.2] | 'w2' [0.2] | 'w3' [0.2] | 'w4' [0.2]
"""
print(grammar)

nonconflation = PCFG.fromstring(grammar)

print(nonconflation)

# Sample sentences from the grammar and store them one per line, tokens
# separated by single spaces.  Each sentence is written as soon as it is
# generated instead of being accumulated in one big in-memory string.
with open('nonconflation.txt', 'w') as f:
    for sentence in pcfg_generate(nonconflation, n=100000, depth=100):
        print(' '.join(sentence), file=f)


Example #18
0
def loadGrammar(args):
    """Read the file named by ``args.grammar_file`` and return its contents
    parsed by ``PCFG.fromstring``."""
    with open(args.grammar_file, 'r') as grammar_file:
        grammar_text = grammar_file.read()
    return PCFG.fromstring(grammar_text)
Example #19
0
    def test_parse_ambiguity(self):
        """Parse a sentence with an ambiguous PP attachment and check that
        the parser returns the VP-attachment tree with its log probability."""
        # Example taken from pages 4, 5 and 8 of Michael Collins' notes
        # "Probabilistic Context-Free Grammars (PCFGs)".
        # NOTE(review): the lexical items (saw, man, the, ...) are not
        # quoted, and NLTK's grammar reader treats unquoted symbols as
        # nonterminals -- confirm this is what CKYParser expects.
        grammar = PCFG.fromstring("""
                S -> NP VP              [1.0]

                VP -> Vt NP             [0.65]
                VP -> VP PP             [0.35]

                NP -> DT NN             [0.8]
                NP -> NP PP             [0.2]

                PP -> IN NP             [1.0]

                Vt -> saw               [1.0]

                NN -> man               [0.2]
                NN -> telescope         [0.3]
                NN -> dog               [0.5]

                DT -> the               [1.0]

                IN -> with              [1.0]
            """)

        # Changing the VP rules to
        #   VP -> Vt NP             [0.85]
        #   VP -> VP PP             [0.15]
        # yields the other tree (the PP attached to the noun phrase).

        words = 'the man saw the dog with the telescope'.split()
        lp, t = CKYParser(grammar).parse(words)

        # draw_trees(t)

        # Expected parse: "with the telescope" modifies the verb phrase.
        expected_tree = Tree.fromstring("""
                    (S
                        (NP
                            (DT the)
                            (NN man)
                        )
                        (VP
                            (VP
                                (Vt saw)
                                (NP
                                    (DT the)
                                    (NN dog)
                                )
                            )
                            (PP
                                (IN with)
                                (NP
                                    (DT the)
                                    (NN telescope)
                                )
                            )
                        )
                    )
                """)

        self.assertEqual(t, expected_tree)

        # Expected log probability: multiply the probability of every rule
        # used in the tree, in top-down, left-to-right order.
        rule_probs = (
            1.0,            # S -> NP VP
            0.8, 1.0, 0.2,  # NP -> DT NN, DT -> the, NN -> man
            0.35,           # VP -> VP PP
            0.65, 1.0,      # VP -> Vt NP, Vt -> saw
            0.8, 1.0, 0.5,  # NP -> DT NN, DT -> the, NN -> dog
            1.0, 1.0,       # PP -> IN NP, IN -> with
            0.8, 1.0, 0.3,  # NP -> DT NN, DT -> the, NN -> telescope
        )
        tree_prob = 1.0
        for rule_prob in rule_probs:
            tree_prob *= rule_prob
        expected_lp = log2(tree_prob)

        self.assertAlmostEqual(lp, expected_lp)
Example #20
0
# Probabilistic grammar for simple algebraic equations in the single
# variable 'x'.  An equation (S) is a left side, optionally negated,
# followed by '=' and a right side.  The terminals are the digits '0'-'9',
# '.', 'x', the operators '+', '-', '*', '/', the parentheses and '='.
# Each rule carries its probability in square brackets.
# The rules are not grouped by left-hand side; alternatives for one symbol
# (e.g. `side`, `term`, `number`, `digit`) are scattered through the list.
# NOTE(review): the non-round probabilities look estimated from data rather
# than hand-written -- confirm against whatever produced this grammar.
# NOTE(review): no '%start' directive is given, so NLTK should take the
# left-hand side of the first rule (S) as the start symbol -- confirm.
grammar = PCFG.fromstring("""
    S -> side eqside [0.766312]
    side -> side plusterm [0.215816]
    side -> side minusterm [0.204728]
    side -> number variable [0.192571]
    number -> digit number [0.2953]
    digit -> '2' [0.20094]
    number -> '8' [0.0512296]
    variable -> 'x' [1.0]
    minusterm -> minus term [1.0]
    minus -> '-' [1.0]
    term -> digit number [0.185462]
    digit -> '6' [0.062679]
    number -> '0' [0.0918352]
    plusterm -> plus term [1.0]
    plus -> '+' [1.0]
    term -> number divnum [0.205817]
    number -> '2' [0.114112]
    divnum -> div number [1.0]
    div -> '/' [1.0]
    number -> '5' [0.0972758]
    eqside -> eq negside [0.402479]
    eq -> '=' [1.0]
    negside -> minus side [1.0]
    digit -> '1' [0.356414]
    S -> negside eqside [0.233688]
    side -> number divnum [0.0921555]
    digit -> '7' [0.0299559]
    number -> '3' [0.0644647]
    eqside -> eq side [0.597521]
    side -> 'x' [0.0759721]
    side -> '6' [0.00732374]
    digit -> '9' [0.0175681]
    number -> '6' [0.0557699]
    number -> '4' [0.0737897]
    number -> '7' [0.0733781]
    digit -> '3' [0.150809]
    term -> number variable [0.314671]
    side -> number divnumvar [0.0415636]
    divnumvar -> divnum variable [1.0]
    side -> digit number [0.074099]
    number -> dot number [0.0103797]
    dot -> '.' [1.0]
    number -> '1' [0.0461748]
    term -> 'x' [0.032113]
    digit -> '8' [0.0327874]
    digit -> '4' [0.0841404]
    digit -> '5' [0.0397696]
    number -> '9' [0.0262901]
    side -> '1' [0.0088784]
    side -> number parenside [0.0178505]
    parenside -> lparen siderparen [1.0]
    lparen -> '(' [1.0]
    siderparen -> side rparen [0.841012]
    term -> '5' [0.0212008]
    rparen -> ')' [1.0]
    term -> number parenside [0.0322466]
    term -> '3' [0.00984322]
    side -> parenside divnum [0.00844759]
    term -> parenside divnum [0.00383039]
    digit -> '0' [0.0249365]
    side -> '3' [0.00307185]
    term -> '4' [0.0399964]
    term -> '9' [0.0138963]
    term -> '1' [0.0294406]
    side -> '5' [0.00533828]
    term -> '8' [0.0189293]
    side -> '4' [0.00842886]
    side -> '2' [0.0126808]
    term -> '2' [0.035008]
    term -> number divnumvar [0.0251203]
    side -> '0' [0.0074174]
    side -> number vardivnum [0.00829774]
    vardivnum -> variable divnum [1.0]
    term -> number vardivnum [0.00944237]
    side -> dot number [0.0017045]
    side -> number starnum [0.00129243]
    starnum -> star number [1.0]
    star -> '*' [1.0]
    side -> '9' [0.00340901]
    term -> '7' [0.0130946]
    siderparen -> negside rparen [0.158988]
    term -> dot number [0.00218243]
    term -> number starnum [0.0014698]
    term -> '0' [0.00080171]
    side -> '8' [0.00681801]
    side -> '7' [0.00213531]
    term -> '6' [0.00543381]
""")
Example #21
0
def demo():
    """
    Run an interactive demonstration of the Viterbi probabilistic parser.

    The user is prompted on stdin to pick one of two built-in
    (sentence, grammar) demos.  The chosen sentence is parsed with a
    ``ViterbiParser`` at trace level 3, and the elapsed time, number of
    parses and average parse probability are printed.  The user is then
    asked whether the distinct parses should be drawn and/or printed.

    If the answer to the first prompt is not one of the listed demo
    numbers, "Bad sentence number" is printed and the function returns.
    """
    import sys
    import time

    from nltk.grammar import PCFG
    from nltk.parse import ViterbiParser

    def average_prob(trees):
        """Return the mean probability of ``trees``, or 0 if there are none."""
        return sum(tree.prob() for tree in trees) / len(trees) if trees else 0

    toy_pcfg1 = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
    Det -> 'the' [0.8] | 'my' [0.2]
    N -> 'man' [0.5] | 'telescope' [0.5]
    VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
    V -> 'ate' [0.35] | 'saw' [0.65]
    PP -> P NP [1.0]
    P -> 'with' [0.61] | 'under' [0.39]
    """)

    toy_pcfg2 = PCFG.fromstring("""
    S    -> NP VP         [1.0]
    VP   -> V NP          [.59]
    VP   -> V             [.40]
    VP   -> VP PP         [.01]
    NP   -> Det N         [.41]
    NP   -> Name          [.28]
    NP   -> NP PP         [.31]
    PP   -> P NP          [1.0]
    V    -> 'saw'         [.21]
    V    -> 'ate'         [.51]
    V    -> 'ran'         [.28]
    N    -> 'boy'         [.11]
    N    -> 'cookie'      [.12]
    N    -> 'table'       [.13]
    N    -> 'telescope'   [.14]
    N    -> 'hill'        [.5]
    Name -> 'Jack'        [.52]
    Name -> 'Bob'         [.48]
    P    -> 'with'        [.61]
    P    -> 'under'       [.39]
    Det  -> 'the'         [.41]
    Det  -> 'a'           [.31]
    Det  -> 'my'          [.28]
    """)

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [
        ("I saw the man with my telescope", toy_pcfg1),
        ("the boy saw Jack with Bob under the table with a telescope",
         toy_pcfg2),
    ]

    # Ask the user which demo they want to use.
    print()
    for number, (demo_sent, demo_grammar) in enumerate(demos, 1):
        print(f"{number:>3}: {demo_sent}")
        print("     %r" % demo_grammar)
        print()
    print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        if snum < 0:
            # Reject 0 and negative answers explicitly: a negative index
            # would otherwise silently pick a demo from the end of the list.
            raise IndexError(snum)
        sent, grammar = demos[snum]
    except (ValueError, IndexError):
        # Non-numeric input or a number outside the listed range.
        print("Bad sentence number")
        return

    # Tokenize the sentence.
    tokens = sent.split()

    parser = ViterbiParser(grammar)
    all_parses = {}

    print(f"\nsent: {sent}\nparser: {parser}\ngrammar: {grammar}")
    parser.trace(3)
    t = time.time()
    parses = parser.parse_all(tokens)
    # Keep the elapsed time in its own name so the `time` module is not
    # shadowed by a float.
    elapsed = time.time() - t
    average = average_prob(parses)
    num_parses = len(parses)
    # Frozen trees are hashable, so the dict keeps one entry per distinct
    # parse (in first-seen order).
    for parse in parses:
        all_parses[parse.freeze()] = 1

    # Print some summary statistics
    print()
    print("Time (secs)   # Parses   Average P(parse)")
    print("-----------------------------------------")
    print("%11.4f%11d%19.14f" % (elapsed, num_parses, average))
    unique_parses = list(all_parses)
    print("------------------------------------------")
    print("%11s%11d%19.14f" %
          ("n/a", len(unique_parses), average_prob(unique_parses)))

    # Ask the user if we should draw the parses.
    print()
    print("Draw parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        from nltk.draw.tree import draw_trees

        print("  please wait...")
        draw_trees(*unique_parses)

    # Ask the user if we should print the parses.
    print()
    print("Print parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        for parse in unique_parses:
            print(parse)
Example #22
0
def demo(choice=None, draw_parses=None, print_parses=None):
    """
    Run a demonstration of the probabilistic chart parsers.

    One of two built-in (sentence, grammar) demos is parsed by each of
    several ``pchart`` parsers at trace level 3.  A table with the time
    taken, number of parses and average parse probability per parser is
    then printed, followed by a row for the distinct parses found by all
    parsers together.

    :param choice: zero-based index of the demo to run; if None, the user
        is prompted on stdin for a one-based demo number.
    :param draw_parses: whether to draw the distinct parses; if None, the
        user is asked on stdin.
    :param print_parses: whether to print the distinct parses; if None,
        the user is asked on stdin.

    If the selection is not a valid demo, "Bad sentence number" is printed
    and the function returns.
    """
    import sys, time
    from nltk.parse import pchart

    def average_prob(trees):
        """Return the mean probability of ``trees``, or 0 if there are none."""
        return sum(tree.prob() for tree in trees) / len(trees) if trees else 0

    # Define two demos.  Each demo has a sentence and a grammar.
    toy_pcfg1 = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
    Det -> 'the' [0.8] | 'my' [0.2]
    N -> 'man' [0.5] | 'telescope' [0.5]
    VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
    V -> 'ate' [0.35] | 'saw' [0.65]
    PP -> P NP [1.0]
    P -> 'with' [0.61] | 'under' [0.39]
    """)

    toy_pcfg2 = PCFG.fromstring("""
    S    -> NP VP         [1.0]
    VP   -> V NP          [.59]
    VP   -> V             [.40]
    VP   -> VP PP         [.01]
    NP   -> Det N         [.41]
    NP   -> Name          [.28]
    NP   -> NP PP         [.31]
    PP   -> P NP          [1.0]
    V    -> 'saw'         [.21]
    V    -> 'ate'         [.51]
    V    -> 'ran'         [.28]
    N    -> 'boy'         [.11]
    N    -> 'cookie'      [.12]
    N    -> 'table'       [.13]
    N    -> 'telescope'   [.14]
    N    -> 'hill'        [.5]
    Name -> 'Jack'        [.52]
    Name -> 'Bob'         [.48]
    P    -> 'with'        [.61]
    P    -> 'under'       [.39]
    Det  -> 'the'         [.41]
    Det  -> 'a'           [.31]
    Det  -> 'my'          [.28]
    """)

    demos = [('I saw John with my telescope', toy_pcfg1),
             ('the boy saw Jack with Bob under the table with a telescope',
              toy_pcfg2)]

    if choice is None:
        # Ask the user which demo they want to use.
        print()
        for number, (demo_sent, demo_grammar) in enumerate(demos, 1):
            print('%3s: %s' % (number, demo_sent))
            print('     %r' % demo_grammar)
            print()
        print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
        try:
            choice = int(sys.stdin.readline().strip()) - 1
        except ValueError:
            # Non-numeric answer: report it the same way as an
            # out-of-range number instead of raising.
            print('Bad sentence number')
            return
        if choice < 0:
            # An answer of 0 or below would otherwise silently pick a
            # demo from the end of the list via negative indexing.
            print('Bad sentence number')
            return
    try:
        sent, grammar = demos[choice]
    except (IndexError, TypeError):
        # Out-of-range index, or a caller-supplied choice that is not an
        # integer.
        print('Bad sentence number')
        return

    # Tokenize the sentence.
    tokens = sent.split()

    # Define a list of parsers.  We'll use all parsers.
    parsers = [
        pchart.InsideChartParser(grammar),
        pchart.RandomChartParser(grammar),
        pchart.UnsortedChartParser(grammar),
        pchart.LongestChartParser(grammar),
        pchart.InsideChartParser(grammar,
                                 beam_size=len(tokens) + 1)  # was BeamParser
    ]

    # Run the parsers on the tokenized sentence.
    times = []
    average_p = []
    num_parses = []
    all_parses = {}
    for parser in parsers:
        print('\ns: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar))
        parser.trace(3)
        t = time.time()
        parses = list(parser.parse(tokens))
        times.append(time.time() - t)
        average_p.append(average_prob(parses))
        num_parses.append(len(parses))
        # Frozen trees are hashable, so the dict keeps one entry per
        # distinct parse across all parsers (in first-seen order).
        for parse in parses:
            all_parses[parse.freeze()] = 1

    # Print some summary statistics
    print()
    print(
        '       Parser      Beam | Time (secs)   # Parses   Average P(parse)')
    print(
        '------------------------+------------------------------------------')
    for parser, elapsed, count, average in zip(parsers, times, num_parses,
                                               average_p):
        print('%18s %4d |%11.4f%11d%19.14f' %
              (parser.__class__.__name__, parser.beam_size, elapsed,
               count, average))
    unique_parses = list(all_parses)
    print(
        '------------------------+------------------------------------------')
    print('%18s      |%11s%11d%19.14f' %
          ('(All Parses)', 'n/a', len(unique_parses),
           average_prob(unique_parses)))

    if draw_parses is None:
        # Ask the user if we should draw the parses.
        print()
        print('Draw parses (y/n)? ', end=' ')
        draw_parses = sys.stdin.readline().strip().lower().startswith('y')
    if draw_parses:
        from nltk.draw.tree import draw_trees
        print('  please wait...')
        draw_trees(*unique_parses)

    if print_parses is None:
        # Ask the user if we should print the parses.
        print()
        print('Print parses (y/n)? ', end=' ')
        print_parses = sys.stdin.readline().strip().lower().startswith('y')
    if print_parses:
        for parse in unique_parses:
            print(parse)
Example #23
0
            'Usage: %s grammar_file csv_file skiprows nrows (-1 for all) top_K'
            % sys.argv[0])
        print(
            'Example: %s grammar/airbnb_grammar.txt ../data/Airbnb/SanFrancisco_details.csv 0 3 20'
            % sys.argv[0])
        exit(0)
    grammar_txt = open(sys.argv[1]).read()
    csv_file = sys.argv[2]
    skiprows = int(sys.argv[3])
    nrows = int(sys.argv[4])
    if nrows == -1:
        nrows = None
    top_K = int(sys.argv[5])

    # print('Grammar:\n' + grammar_txt + '\n')
    grammar = PCFG.fromstring(grammar_txt)

    # Read CSV file into Pandas DataFrame
    # Handle DtypeWarning: Columns (43) have mixed types. [One entry with zipcode '94107-1273']
    # Fix dollar sign and thousands separators in 'price'
    df = pd.read_csv(csv_file,
                     skiprows=skiprows,
                     nrows=nrows,
                     dtype={
                         'host_id': np.str,
                         'zipcode': np.str
                     },
                     converters={
                         'price':
                         lambda s: float(s.replace('$', '').replace(',', ''))
                     })