# Example 1
def demo():
    """
    A demonstration showing how C{Grammar}s can be created and used.
    """

    from nltk import cfg

    # Create some nonterminals
    S, NP, VP, PP = cfg.nonterminals('S, NP, VP, PP')
    N, V, P, Det = cfg.nonterminals('N, V, P, Det')
    VP_slash_NP = VP/NP

    print 'Some nonterminals:', [S, NP, VP, PP, N, V, P, Det, VP/NP]
    print '    S.symbol() =>', `S.symbol()`
    print

    print cfg.Production(S, [NP])

    # Create some Grammar Productions
    grammar = cfg.parse_grammar("""
    S -> NP VP
    PP -> P NP
    NP -> Det N
    NP -> NP PP
    VP -> V NP
    VP -> VP PP
    Det -> 'a'
    Det -> 'the'
    N -> 'dog'
    N -> 'cat'
    V -> 'chased'
    V -> 'sat'
    P -> 'on'
    P -> 'in'
    """)

    print 'A Grammar:', `grammar`
    print '    grammar.start()       =>', `grammar.start()`
    print '    grammar.productions() =>',
    # Use string.replace(...) is to line-wrap the output.
    print `grammar.productions()`.replace(',', ',\n'+' '*25)
    print
# Example 2
def demo():
    """
    A demonstration showing how C{Grammar}s can be created and used.
    """

    from nltk import cfg

    # Create some nonterminals
    S, NP, VP, PP = cfg.nonterminals('S, NP, VP, PP')
    N, V, P, Det = cfg.nonterminals('N, V, P, Det')
    VP_slash_NP = VP / NP

    print 'Some nonterminals:', [S, NP, VP, PP, N, V, P, Det, VP / NP]
    print '    S.symbol() =>', ` S.symbol() `
    print

    print cfg.Production(S, [NP])

    # Create some Grammar Productions
    grammar = cfg.parse_grammar("""
    S -> NP VP
    PP -> P NP
    NP -> Det N
    NP -> NP PP
    VP -> V NP
    VP -> VP PP
    Det -> 'a'
    Det -> 'the'
    N -> 'dog'
    N -> 'cat'
    V -> 'chased'
    V -> 'sat'
    P -> 'on'
    P -> 'in'
    """)

    print 'A Grammar:', ` grammar `
    print '    grammar.start()       =>', ` grammar.start() `
    print '    grammar.productions() =>',
    # Use string.replace(...) is to line-wrap the output.
    print ` grammar.productions() `.replace(',', ',\n' + ' ' * 25)
    print
    def parse(self, p_string):
        """
        Parses a string and stores the resulting hierarchy of "domains",
        "hierarchies" and "tables".

        For the sake of NLP I've parsed the string using the nltk
        context free grammar library.

        A query is a "sentence" and can either be a domain, hierarchy or a table.
        A domain is simply a word.
        A hierarchy is expressed as "domain/domain"
        A table is expressed as "table(sentence, sentence, sentence)"

        Internally the query is represented as a nltk.parse.tree

        Process:
          1. string is tokenized
          2. develop a context free grammar
          3. parse
          4. convert to a tree representation

        Side effects: sets self.string, self.parseList, self.nltktree,
        self.parseTree and self.xml.  On a failed parse, prints a
        message and returns early with self.nltktree left as None.
        """
        # Reset the tree; stays None if parsing fails below.
        self.nltktree = None

        # Store the query string
        self.string = p_string

        # Tokenize the query string, allowing only strings, parentheses,
        # forward slashes and commas.  "table(" is kept as one token.
        re_all = r'table[(]|\,|[)]|[/]|\w+'
        data_tokens = tokenize.regexp(self.string, re_all)

        # Develop a context free grammar
        # O = sentence (start symbol), T = table, H = hierarchy, D = domain
        O, T, H, D = cfg.nonterminals('O, T, H, D')

        # Specify the grammar as a tuple so it can be extended by
        # concatenation below.
        productions = (
            # A sentence can be either a table, hierarchy or domain
            cfg.Production(O, [D]),
            cfg.Production(O, [H]),
            cfg.Production(O, [T]),

            # A table must be the following sequence:
            # "table(", sentence, comma, sentence, comma, sentence, ")"
            cfg.Production(T, ['table(', O, ',', O, ',', O, ')']),

            # A hierarchy must be the following sequence:
            # domain, forward slash, domain
            cfg.Production(H, [D, '/', D]),
            # domain, forward slash, another operator
            cfg.Production(H, [D, '/', O]))

        # Add domains to the cfg productions
        # A domain is a token that is entirely word chars
        # NOTE(review): bare compile() — presumably `from re import compile`
        # at module level; verify against the imports.
        re_domain = compile(r'^\w+$')
        # Try every token and add if it matches the above regular expression
        for tok in data_tokens:
            if re_domain.match(tok):
                # Trailing comma makes a 1-tuple so tuple concatenation works.
                prod = cfg.Production(D, [tok]),
                productions = productions + prod

        # Make a grammar out of our productions
        grammar = cfg.Grammar(O, productions)
        rd_parser = parse.RecursiveDescentParser(grammar)

        # Tokens need to be redefined: data_tokens is an iterator and is
        # exhausted after the loop above, so re-tokenize for the parser.
        tokens = tokenize.regexp_tokenize(self.string, re_all)
        toklist = list(tokens)

        # Store the parsing.
        # Only the first one, as the grammar should be completely nonambiguous.
        try:
            self.parseList = rd_parser.get_parse_list(toklist)[0]
        except IndexError:
            # No parse found: report and leave self.nltktree as None.
            print "Could not parse query."
            return

        # Set the nltk.parse.tree tree for this query to the global sentence.
        # Strip markers so the stringified parse can be re-read as a
        # bracketed tree (replacement order matters: "')'" before "'").
        string = str(self.parseList)
        string2 = string.replace(":", "").replace("')'", "").replace(
            "table(", "").replace("','", "").replace("'", "").replace("/", "")
        self.nltktree = parse.tree.bracket_parse(string2)

        # Store the resulting nltk.parse.tree tree
        self.parseTree = QuerySentence(self.nltktree)
        self.xml = self.parseTree.toXML()
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.
    """
    import sys, time
    from nltk.tokenizer import WhitespaceTokenizer

    # Define some nonterminals
    S, VP, NP, PP = nonterminals("S, VP, NP, PP")
    V, N, P, Name, Det = nonterminals("V, N, P, Name, Det")

    # Define a PCFG
    grammar_productions1 = [
        PCFGProduction(NP, [Det, N], prob=0.5),
        PCFGProduction(NP, [NP, PP], prob=0.25),
        PCFGProduction(NP, ["John"], prob=0.1),
        PCFGProduction(NP, ["I"], prob=0.15),
        PCFGProduction(Det, ["the"], prob=0.8),
        PCFGProduction(Det, ["my"], prob=0.2),
        PCFGProduction(N, ["dog"], prob=0.5),
        PCFGProduction(N, ["cookie"], prob=0.5),
        PCFGProduction(VP, [VP, PP], prob=0.1),
        PCFGProduction(VP, [V, NP], prob=0.7),
        PCFGProduction(VP, [V], prob=0.2),
        PCFGProduction(V, ["ate"], prob=0.35),
        PCFGProduction(V, ["saw"], prob=0.65),
        PCFGProduction(S, [NP, VP], prob=1.0),
        PCFGProduction(PP, [P, NP], prob=1.0),
        PCFGProduction(P, ["with"], prob=0.61),
        PCFGProduction(P, ["under"], prob=0.39),
    ]
    pcfg1 = PCFG(S, grammar_productions1)

    # Define a second, more extensive, grammar.
    lexicon = [
        PCFGProduction(V, ["saw"], prob=0.21),
        PCFGProduction(V, ["ate"], prob=0.51),
        PCFGProduction(V, ["ran"], prob=0.28),
        PCFGProduction(N, ["boy"], prob=0.11),
        PCFGProduction(N, ["cookie"], prob=0.12),
        PCFGProduction(N, ["table"], prob=0.13),
        PCFGProduction(N, ["telescope"], prob=0.14),
        PCFGProduction(N, ["hill"], prob=0.50),
        PCFGProduction(Name, ["Jack"], prob=0.52),
        PCFGProduction(Name, ["Bob"], prob=0.48),
        PCFGProduction(P, ["with"], prob=0.61),
        PCFGProduction(P, ["under"], prob=0.39),
        PCFGProduction(Det, ["the"], prob=0.41),
        PCFGProduction(Det, ["a"], prob=0.31),
        PCFGProduction(Det, ["my"], prob=0.28),
    ]
    grammar_productions2 = lexicon + [
        PCFGProduction(S, [NP, VP], prob=1.00),
        PCFGProduction(VP, [V, NP], prob=0.59),
        PCFGProduction(VP, [V], prob=0.40),
        PCFGProduction(VP, [VP, PP], prob=0.01),
        PCFGProduction(NP, [Det, N], prob=0.41),
        PCFGProduction(NP, [Name], prob=0.28),
        PCFGProduction(NP, [NP, PP], prob=0.31),
        PCFGProduction(PP, [P, NP], prob=1.00),
    ]
    pcfg2 = PCFG(S, grammar_productions2)

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [
        ("I saw John with my cookie", pcfg1),
        ("the boy saw Jack with Bob under the table with a telescope", pcfg2),
    ]

    # Ask the user which demo they want to use.
    print
    for i in range(len(demos)):
        print "%3s: %s" % (i + 1, demos[i][0])
        print "     %r" % demos[i][1]
        print
    print "Which demo (%d-%d)? " % (1, len(demos)),
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        s, pcfg = demos[snum]
    except:
        print "Bad sentence number"
        return

    # Tokenize the sentence.
    token = Token(TEXT=s)
    WhitespaceTokenizer(SUBTOKENS="WORDS").tokenize(token, add_locs=True)

    # Define a list of parsers.  We'll use all parsers.
    parsers = [
        ViterbiPCFGParser(pcfg, LEAF="TEXT", SUBTOKENS="WORDS"),
        InsidePCFGParser(pcfg, LEAF="TEXT", SUBTOKENS="WORDS"),
        RandomPCFGParser(pcfg, LEAF="TEXT", SUBTOKENS="WORDS"),
        UnsortedPCFGParser(pcfg, LEAF="TEXT", SUBTOKENS="WORDS"),
        LongestPCFGParser(pcfg, LEAF="TEXT", SUBTOKENS="WORDS"),
        BeamPCFGParser(len(token["WORDS"]) + 1, pcfg, LEAF="TEXT", SUBTOKENS="WORDS"),
    ]

    # Run the parsers on the tokenized sentence.
    times = []
    average_p = []
    num_parses = []
    all_parses = {}
    for parser in parsers:
        print "\ns: %s\nparser: %s\ngrammar: %s" % (S, parser, pcfg)
        parser.trace(3)
        t = time.time()
        parses = parser.get_parse_list(token)
        times.append(time.time() - t)
        if parses:
            p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
        else:
            p = 0
        average_p.append(p)
        num_parses.append(len(parses))
        for p in parses:
            all_parses[p.freeze(lambda t: t.freeze())] = 1

    # Print some summary statistics
    print
    print "       Parser      | Time (secs)   # Parses   Average P(parse)"
    print "-------------------+------------------------------------------"
    for i in range(len(parsers)):
        print "%18s |%11.4f%11d%19.14f" % (parsers[i].__class__.__name__, times[i], num_parses[i], average_p[i])
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print "-------------------+------------------------------------------"
    print "%18s |%11s%11d%19.14f" % ("(All Parses)", "n/a", len(parses), p)

    # Ask the user if we should draw the parses.
    print
    print "Draw parses (y/n)? ",
    if sys.stdin.readline().strip().lower().startswith("y"):
        import nltk.draw.tree

        print "  please wait..."
        nltk.draw.tree.draw_trees(*parses)

    # Ask the user if we should print the parses.
    print
    print "Print parses (y/n)? ",
    if sys.stdin.readline().strip().lower().startswith("y"):
        for parse in parses:
            print parse
# Example 5
    def parse(self, p_string):
        """
        Parses a string and stores the resulting hierarchy of "domains"
        "hierarchies" and "tables"

        For the sake of NLP I've parsed the string using the nltk 
        context free grammar library.

        A query is a "sentence" and can either be a domain, hierarchy or a table.
        A domain is simply a word.
        A hierarchy is expressed as "domain/domain"
        A table is exressed as "table(sentence, sentence, sentence)"

        Internally the query is represented as a nltk.parse.tree

        Process:
          1. string is tokenized
          2. develop a context free grammar
          3. parse
          4. convert to a tree representation
        """
        self.nltktree = None

        # Store the query string
        self.string = p_string

        # Tokenize the query string, allowing only strings, parentheses,
        # forward slashes and commas.
        re_all = r'table[(]|\,|[)]|[/]|\w+'
        data_tokens = tokenize.regexp(self.string, re_all)

        # Develop a context free grammar
        # S = sentence, T = table, H = hierarchy, D = domain
        O, T, H, D = cfg.nonterminals('O, T, H, D')

        # Specify the grammar
        productions = (
            # A sentence can be either a table, hierarchy or domain
            cfg.Production(O, [D]), cfg.Production(O, [H]), cfg.Production(O, [T]),
            
            # A table must be the following sequence:
            # "table(", sentence, comma, sentence, comma, sentence, ")" 
            cfg.Production(T, ['table(', O, ',', O, ',', O, ')']),

            # A hierarchy must be the following sequence:
            # domain, forward slash, domain
            cfg.Production(H, [D, '/', D]),
            # domain, forward slash, another operator
            cfg.Production(H, [D, '/', O])
        )

        # Add domains to the cfg productions
        # A domain is a token that is entirely word chars
        re_domain = compile(r'^\w+$') 
        # Try every token and add if it matches the above regular expression
        for tok in data_tokens:
            if re_domain.match(tok):
                prod = cfg.Production(D,[tok]),
                productions = productions + prod

        # Make a grammar out of our productions
        grammar = cfg.Grammar(O, productions)
        rd_parser = parse.RecursiveDescentParser(grammar)
       
        # Tokens need to be redefined. 
        # It disappears after first use, and I don't know why.
        tokens = tokenize.regexp_tokenize(self.string, re_all)
        toklist = list(tokens)

        # Store the parsing. 
        # Only the first one, as the grammar should be completely nonambiguous.
        try:
            self.parseList = rd_parser.get_parse_list(toklist)[0]
        except IndexError: 
            print "Could not parse query."
            return

        # Set the nltk.parse.tree tree for this query to the global sentence
        string = str(self.parseList)
        string2 = string.replace(":","").replace("')'","").replace("table(","").replace("','","").replace("'","").replace("/","")
        self.nltktree = parse.tree.bracket_parse(string2)
        
        # Store the resulting nltk.parse.tree tree
        self.parseTree = QuerySentence(self.nltktree)
        self.xml = self.parseTree.toXML()