def demo(): """ A demonstration showing how C{Grammar}s can be created and used. """ from nltk import cfg # Create some nonterminals S, NP, VP, PP = cfg.nonterminals('S, NP, VP, PP') N, V, P, Det = cfg.nonterminals('N, V, P, Det') VP_slash_NP = VP/NP print 'Some nonterminals:', [S, NP, VP, PP, N, V, P, Det, VP/NP] print ' S.symbol() =>', `S.symbol()` print print cfg.Production(S, [NP]) # Create some Grammar Productions grammar = cfg.parse_grammar(""" S -> NP VP PP -> P NP NP -> Det N NP -> NP PP VP -> V NP VP -> VP PP Det -> 'a' Det -> 'the' N -> 'dog' N -> 'cat' V -> 'chased' V -> 'sat' P -> 'on' P -> 'in' """) print 'A Grammar:', `grammar` print ' grammar.start() =>', `grammar.start()` print ' grammar.productions() =>', # Use string.replace(...) is to line-wrap the output. print `grammar.productions()`.replace(',', ',\n'+' '*25) print
def demo(): """ A demonstration showing how C{Grammar}s can be created and used. """ from nltk import cfg # Create some nonterminals S, NP, VP, PP = cfg.nonterminals('S, NP, VP, PP') N, V, P, Det = cfg.nonterminals('N, V, P, Det') VP_slash_NP = VP / NP print 'Some nonterminals:', [S, NP, VP, PP, N, V, P, Det, VP / NP] print ' S.symbol() =>', ` S.symbol() ` print print cfg.Production(S, [NP]) # Create some Grammar Productions grammar = cfg.parse_grammar(""" S -> NP VP PP -> P NP NP -> Det N NP -> NP PP VP -> V NP VP -> VP PP Det -> 'a' Det -> 'the' N -> 'dog' N -> 'cat' V -> 'chased' V -> 'sat' P -> 'on' P -> 'in' """) print 'A Grammar:', ` grammar ` print ' grammar.start() =>', ` grammar.start() ` print ' grammar.productions() =>', # Use string.replace(...) is to line-wrap the output. print ` grammar.productions() `.replace(',', ',\n' + ' ' * 25) print
def parse(self, p_string): """ Parses a string and stores the resulting hierarchy of "domains" "hierarchies" and "tables" For the sake of NLP I've parsed the string using the nltk context free grammar library. A query is a "sentence" and can either be a domain, hierarchy or a table. A domain is simply a word. A hierarchy is expressed as "domain/domain" A table is exressed as "table(sentence, sentence, sentence)" Internally the query is represented as a nltk.parse.tree Process: 1. string is tokenized 2. develop a context free grammar 3. parse 4. convert to a tree representation """ self.nltktree = None # Store the query string self.string = p_string # Tokenize the query string, allowing only strings, parentheses, # forward slashes and commas. re_all = r'table[(]|\,|[)]|[/]|\w+' data_tokens = tokenize.regexp(self.string, re_all) # Develop a context free grammar # S = sentence, T = table, H = hierarchy, D = domain O, T, H, D = cfg.nonterminals('O, T, H, D') # Specify the grammar productions = ( # A sentence can be either a table, hierarchy or domain cfg.Production(O, [D]), cfg.Production(O, [H]), cfg.Production(O, [T]), # A table must be the following sequence: # "table(", sentence, comma, sentence, comma, sentence, ")" cfg.Production(T, ['table(', O, ',', O, ',', O, ')']), # A hierarchy must be the following sequence: # domain, forward slash, domain cfg.Production(H, [D, '/', D]), # domain, forward slash, another operator cfg.Production(H, [D, '/', O])) # Add domains to the cfg productions # A domain is a token that is entirely word chars re_domain = compile(r'^\w+$') # Try every token and add if it matches the above regular expression for tok in data_tokens: if re_domain.match(tok): prod = cfg.Production(D, [tok]), productions = productions + prod # Make a grammar out of our productions grammar = cfg.Grammar(O, productions) rd_parser = parse.RecursiveDescentParser(grammar) # Tokens need to be redefined. # It disappears after first use, and I don't know why. tokens = tokenize.regexp_tokenize(self.string, re_all) toklist = list(tokens) # Store the parsing. # Only the first one, as the grammar should be completely nonambiguous. try: self.parseList = rd_parser.get_parse_list(toklist)[0] except IndexError: print "Could not parse query." return # Set the nltk.parse.tree tree for this query to the global sentence string = str(self.parseList) string2 = string.replace(":", "").replace("')'", "").replace( "table(", "").replace("','", "").replace("'", "").replace("/", "") self.nltktree = parse.tree.bracket_parse(string2) # Store the resulting nltk.parse.tree tree self.parseTree = QuerySentence(self.nltktree) self.xml = self.parseTree.toXML()
def demo(): """ A demonstration of the probabilistic parsers. The user is prompted to select which demo to run, and how many parses should be found; and then each parser is run on the same demo, and a summary of the results are displayed. """ import sys, time from nltk.tokenizer import WhitespaceTokenizer # Define some nonterminals S, VP, NP, PP = nonterminals("S, VP, NP, PP") V, N, P, Name, Det = nonterminals("V, N, P, Name, Det") # Define a PCFG grammar_productions1 = [ PCFGProduction(NP, [Det, N], prob=0.5), PCFGProduction(NP, [NP, PP], prob=0.25), PCFGProduction(NP, ["John"], prob=0.1), PCFGProduction(NP, ["I"], prob=0.15), PCFGProduction(Det, ["the"], prob=0.8), PCFGProduction(Det, ["my"], prob=0.2), PCFGProduction(N, ["dog"], prob=0.5), PCFGProduction(N, ["cookie"], prob=0.5), PCFGProduction(VP, [VP, PP], prob=0.1), PCFGProduction(VP, [V, NP], prob=0.7), PCFGProduction(VP, [V], prob=0.2), PCFGProduction(V, ["ate"], prob=0.35), PCFGProduction(V, ["saw"], prob=0.65), PCFGProduction(S, [NP, VP], prob=1.0), PCFGProduction(PP, [P, NP], prob=1.0), PCFGProduction(P, ["with"], prob=0.61), PCFGProduction(P, ["under"], prob=0.39), ] pcfg1 = PCFG(S, grammar_productions1) # Define a second, more extensive, grammar. lexicon = [ PCFGProduction(V, ["saw"], prob=0.21), PCFGProduction(V, ["ate"], prob=0.51), PCFGProduction(V, ["ran"], prob=0.28), PCFGProduction(N, ["boy"], prob=0.11), PCFGProduction(N, ["cookie"], prob=0.12), PCFGProduction(N, ["table"], prob=0.13), PCFGProduction(N, ["telescope"], prob=0.14), PCFGProduction(N, ["hill"], prob=0.50), PCFGProduction(Name, ["Jack"], prob=0.52), PCFGProduction(Name, ["Bob"], prob=0.48), PCFGProduction(P, ["with"], prob=0.61), PCFGProduction(P, ["under"], prob=0.39), PCFGProduction(Det, ["the"], prob=0.41), PCFGProduction(Det, ["a"], prob=0.31), PCFGProduction(Det, ["my"], prob=0.28), ] grammar_productions2 = lexicon + [ PCFGProduction(S, [NP, VP], prob=1.00), PCFGProduction(VP, [V, NP], prob=0.59), PCFGProduction(VP, [V], prob=0.40), PCFGProduction(VP, [VP, PP], prob=0.01), PCFGProduction(NP, [Det, N], prob=0.41), PCFGProduction(NP, [Name], prob=0.28), PCFGProduction(NP, [NP, PP], prob=0.31), PCFGProduction(PP, [P, NP], prob=1.00), ] pcfg2 = PCFG(S, grammar_productions2) # Define two demos. Each demo has a sentence and a grammar. demos = [ ("I saw John with my cookie", pcfg1), ("the boy saw Jack with Bob under the table with a telescope", pcfg2), ] # Ask the user which demo they want to use. print for i in range(len(demos)): print "%3s: %s" % (i + 1, demos[i][0]) print " %r" % demos[i][1] print print "Which demo (%d-%d)? " % (1, len(demos)), try: snum = int(sys.stdin.readline().strip()) - 1 s, pcfg = demos[snum] except: print "Bad sentence number" return # Tokenize the sentence. token = Token(TEXT=s) WhitespaceTokenizer(SUBTOKENS="WORDS").tokenize(token, add_locs=True) # Define a list of parsers. We'll use all parsers. parsers = [ ViterbiPCFGParser(pcfg, LEAF="TEXT", SUBTOKENS="WORDS"), InsidePCFGParser(pcfg, LEAF="TEXT", SUBTOKENS="WORDS"), RandomPCFGParser(pcfg, LEAF="TEXT", SUBTOKENS="WORDS"), UnsortedPCFGParser(pcfg, LEAF="TEXT", SUBTOKENS="WORDS"), LongestPCFGParser(pcfg, LEAF="TEXT", SUBTOKENS="WORDS"), BeamPCFGParser(len(token["WORDS"]) + 1, pcfg, LEAF="TEXT", SUBTOKENS="WORDS"), ] # Run the parsers on the tokenized sentence. 
    times = []
    average_p = []
    num_parses = []
    all_parses = {}
    for parser in parsers:
        print "\ns: %s\nparser: %s\ngrammar: %s" % (s, parser, pcfg)
        parser.trace(3)
        t = time.time()
        parses = parser.get_parse_list(token)
        times.append(time.time() - t)
        if parses:
            p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
        else:
            p = 0
        average_p.append(p)
        num_parses.append(len(parses))
        for p in parses:
            all_parses[p.freeze(lambda t: t.freeze())] = 1

    # Print some summary statistics
    print
    print "       Parser      | Time (secs)   # Parses   Average P(parse)"
    print "-------------------+------------------------------------------"
    for i in range(len(parsers)):
        print "%18s |%11.4f%11d%19.14f" % (parsers[i].__class__.__name__,
                                           times[i], num_parses[i],
                                           average_p[i])
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print "-------------------+------------------------------------------"
    print "%18s |%11s%11d%19.14f" % ("(All Parses)", "n/a", len(parses), p)

    # Ask the user if we should draw the parses.
    print
    print "Draw parses (y/n)? ",
    if sys.stdin.readline().strip().lower().startswith("y"):
        import nltk.draw.tree
        print "  please wait..."
        nltk.draw.tree.draw_trees(*parses)

    # Ask the user if we should print the parses.
    print
    print "Print parses (y/n)? ",
    if sys.stdin.readline().strip().lower().startswith("y"):
        for parse in parses:
            print parse
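
# --- Illustrative sketch (not part of the original demo) --------------------
# A non-interactive variant, assuming the same module-level names the demo
# relies on (nonterminals, PCFGProduction, PCFG, ViterbiPCFGParser, Token and
# WhitespaceTokenizer): it runs just the Viterbi parser on a tiny, made-up
# grammar and sentence, with no prompting.  The function name and grammar are
# hypothetical.
def demo_viterbi_only():
    from nltk.tokenizer import WhitespaceTokenizer
    S, NP, VP = nonterminals("S, NP, VP")
    Det, N, V = nonterminals("Det, N, V")
    toy_pcfg = PCFG(S, [
        PCFGProduction(S, [NP, VP], prob=1.0),
        PCFGProduction(NP, [Det, N], prob=0.6),
        PCFGProduction(NP, ["John"], prob=0.4),
        PCFGProduction(VP, [V, NP], prob=1.0),
        PCFGProduction(Det, ["the"], prob=1.0),
        PCFGProduction(N, ["dog"], prob=1.0),
        PCFGProduction(V, ["saw"], prob=1.0),
        ])
    token = Token(TEXT="John saw the dog")
    WhitespaceTokenizer(SUBTOKENS="WORDS").tokenize(token, add_locs=True)
    parser = ViterbiPCFGParser(toy_pcfg, LEAF="TEXT", SUBTOKENS="WORDS")
    for tree in parser.get_parse_list(token):
        print "p=%.6f" % tree.prob()
        print tree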
def parse(self, p_string): """ Parses a string and stores the resulting hierarchy of "domains" "hierarchies" and "tables" For the sake of NLP I've parsed the string using the nltk context free grammar library. A query is a "sentence" and can either be a domain, hierarchy or a table. A domain is simply a word. A hierarchy is expressed as "domain/domain" A table is exressed as "table(sentence, sentence, sentence)" Internally the query is represented as a nltk.parse.tree Process: 1. string is tokenized 2. develop a context free grammar 3. parse 4. convert to a tree representation """ self.nltktree = None # Store the query string self.string = p_string # Tokenize the query string, allowing only strings, parentheses, # forward slashes and commas. re_all = r'table[(]|\,|[)]|[/]|\w+' data_tokens = tokenize.regexp(self.string, re_all) # Develop a context free grammar # S = sentence, T = table, H = hierarchy, D = domain O, T, H, D = cfg.nonterminals('O, T, H, D') # Specify the grammar productions = ( # A sentence can be either a table, hierarchy or domain cfg.Production(O, [D]), cfg.Production(O, [H]), cfg.Production(O, [T]), # A table must be the following sequence: # "table(", sentence, comma, sentence, comma, sentence, ")" cfg.Production(T, ['table(', O, ',', O, ',', O, ')']), # A hierarchy must be the following sequence: # domain, forward slash, domain cfg.Production(H, [D, '/', D]), # domain, forward slash, another operator cfg.Production(H, [D, '/', O]) ) # Add domains to the cfg productions # A domain is a token that is entirely word chars re_domain = compile(r'^\w+$') # Try every token and add if it matches the above regular expression for tok in data_tokens: if re_domain.match(tok): prod = cfg.Production(D,[tok]), productions = productions + prod # Make a grammar out of our productions grammar = cfg.Grammar(O, productions) rd_parser = parse.RecursiveDescentParser(grammar) # Tokens need to be redefined. # It disappears after first use, and I don't know why. tokens = tokenize.regexp_tokenize(self.string, re_all) toklist = list(tokens) # Store the parsing. # Only the first one, as the grammar should be completely nonambiguous. try: self.parseList = rd_parser.get_parse_list(toklist)[0] except IndexError: print "Could not parse query." return # Set the nltk.parse.tree tree for this query to the global sentence string = str(self.parseList) string2 = string.replace(":","").replace("')'","").replace("table(","").replace("','","").replace("'","").replace("/","") self.nltktree = parse.tree.bracket_parse(string2) # Store the resulting nltk.parse.tree tree self.parseTree = QuerySentence(self.nltktree) self.xml = self.parseTree.toXML()