def demo():
    """
    A demonstration showing how PCFG C{Grammar}s can be created and used.
    """
    from en.parser.nltk_lite.corpora import treebank, extract
    from en.parser.nltk_lite.parse import cfg, pcfg, pchart, treetransforms
    from itertools import islice

    # Create some probabilistic CFG Productions
    S, A, B, C = cfg.nonterminals('S A B C')
    pcfg_prods = [
        pcfg.Production(A, [B, B], prob=0.3),
        pcfg.Production(A, [C, B, C], prob=0.7),
        pcfg.Production(B, [B, 'b'], prob=0.5),
        pcfg.Production(B, [C], prob=0.5),
        pcfg.Production(C, ['a'], prob=0.1),
        pcfg.Production(C, ['b'], prob=0.9)
    ]
    pcfg_prod = pcfg_prods[2]
    print('A PCFG production:', repr(pcfg_prod))
    print('    pcfg_prod.lhs()  =>', repr(pcfg_prod.lhs()))
    print('    pcfg_prod.rhs()  =>', repr(pcfg_prod.rhs()))
    print('    pcfg_prod.prob() =>', repr(pcfg_prod.prob()))
    print()

    # Create and print a PCFG
    grammar = pcfg.Grammar(S, pcfg_prods)
    print('A PCFG grammar:', repr(grammar))
    print('    grammar.start()       =>', repr(grammar.start()))
    print('    grammar.productions() =>', end=' ')
    # str.replace(...) is used to line-wrap the output.
    print(repr(grammar.productions()).replace(',', ',\n' + ' ' * 26))
    print()

    # Extract productions from three trees and induce the PCFG.
    print("Induce PCFG grammar from treebank data:")

    productions = []
    for tree in islice(treebank.parsed(), 3):
        # Perform optional in-place tree transformations, e.g.:
        # treetransforms.collapseUnary(tree, collapsePOS=False)
        # treetransforms.chomskyNormalForm(tree, horzMarkov=2)
        productions += tree.productions()

    grammar = pcfg.induce(S, productions)
    print(grammar)
    print()

    print("Parse sentence using induced grammar:")

    parser = pchart.InsideParse(grammar)
    parser.trace(3)

    sent = extract(0, treebank.raw())
    print(sent)

    for parse in parser.get_parse_list(sent):
        print(parse)
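# A minimal sketch (an addition, not part of the original demo): pcfg.Grammar is expected
# to require that the probabilities of all productions sharing a left-hand side sum to
# (roughly) 1, which is why the hand-built productions above are grouped as 0.3/0.7 for A,
# 0.5/0.5 for B and 0.1/0.9 for C.  This helper checks that invariant for any production
# list built as in demo(); the function name is ours, not part of the library.
def check_prob_sums(prods):
    totals = {}
    for p in prods:
        totals[p.lhs()] = totals.get(p.lhs(), 0.0) + p.prob()
    for lhs, total in totals.items():
        assert abs(total - 1.0) < 1e-6, 'probabilities for %s sum to %s' % (lhs, total)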
def demo():
    """
    A demonstration showing how C{Grammar}s can be created and used.
    """
    from en.parser.nltk_lite.parse import cfg

    # Create some nonterminals
    S, NP, VP, PP = cfg.nonterminals('S, NP, VP, PP')
    N, V, P, Det = cfg.nonterminals('N, V, P, Det')
    # The slash operator builds a new "slashed" nonterminal, e.g. VP/NP.
    VP_slash_NP = VP / NP

    print('Some nonterminals:', [S, NP, VP, PP, N, V, P, Det, VP / NP])
    print('    S.symbol() =>', repr(S.symbol()))
    print()

    print(cfg.Production(S, [NP]))

    # Create some Grammar Productions
    grammar = cfg.parse_grammar("""
    S -> NP VP
    PP -> P NP
    NP -> Det N
    NP -> NP PP
    VP -> V NP
    VP -> VP PP
    Det -> 'a'
    Det -> 'the'
    N -> 'dog'
    N -> 'cat'
    V -> 'chased'
    V -> 'sat'
    P -> 'on'
    P -> 'in'
    """)

    print('A Grammar:', repr(grammar))
    print('    grammar.start()       =>', repr(grammar.start()))
    print('    grammar.productions() =>', end=' ')
    # str.replace(...) is used to line-wrap the output.
    print(repr(grammar.productions()).replace(',', ',\n' + ' ' * 25))
    print()
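# A usage sketch (an addition, not in the original source): the grammar parsed above can be
# handed to the parsers demonstrated later in this file.  ShiftReduce, tokenize.whitespace
# and get_parse_list are the names those demos use; the import path for tokenize is an
# assumption based on this package layout.  A shift-reduce pass is used here because the
# grammar is left-recursive (NP -> NP PP, VP -> VP PP), which the naive recursive-descent
# parser cannot handle.
def parse_with_toy_grammar(grammar, text='the dog chased a cat'):
    from en.parser.nltk_lite import tokenize  # assumed path, mirrors the demos below
    sent = list(tokenize.whitespace(text))
    parser = ShiftReduce(grammar)  # ShiftReduce as in the shift-reduce demo below
    for tree in parser.get_parse_list(sent):
        print(tree)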
def demo():
    """
    A demonstration of the recursive descent parser.
    """
    from en.parser.nltk_lite import tokenize
    from en.parser.nltk_lite.parse import cfg

    # Define some nonterminals
    S, VP, NP, PP = cfg.nonterminals('S, VP, NP, PP')
    V, N, P, Name, Det = cfg.nonterminals('V, N, P, Name, Det')

    # Define a grammar.
    productions = (
        # Syntactic Productions
        cfg.Production(S, [NP, 'saw', NP]),
        cfg.Production(S, [NP, VP]),
        cfg.Production(NP, [Det, N]),
        cfg.Production(VP, [V, NP, PP]),
        cfg.Production(NP, [Det, N, PP]),
        cfg.Production(PP, [P, NP]),

        # Lexical Productions
        cfg.Production(NP, ['I']),
        cfg.Production(Det, ['the']),
        cfg.Production(Det, ['a']),
        cfg.Production(N, ['man']),
        cfg.Production(V, ['saw']),
        cfg.Production(P, ['in']),
        cfg.Production(P, ['with']),
        cfg.Production(N, ['park']),
        cfg.Production(N, ['dog']),
        cfg.Production(N, ['telescope'])
    )
    grammar = cfg.Grammar(S, productions)

    # Tokenize a sample sentence.
    sent = list(tokenize.whitespace('I saw a man in the park'))

    # Define a parser (RecursiveDescent is assumed to be defined in the enclosing module).
    parser = RecursiveDescent(grammar)
    parser.trace()
    for p in parser.get_parse_list(sent):
        print(p)
def demo():
    """
    A demonstration of the shift-reduce parser.
    """
    from en.parser.nltk_lite import tokenize
    from en.parser.nltk_lite.parse import cfg

    # Define some nonterminals
    S, VP, NP, PP = cfg.nonterminals("S, VP, NP, PP")
    V, N, P, Name, Det = cfg.nonterminals("V, N, P, Name, Det")

    # Define a grammar.
    productions = (
        # Syntactic Productions
        cfg.Production(S, [NP, "saw", NP]),
        cfg.Production(S, [NP, VP]),
        cfg.Production(NP, [Det, N]),
        cfg.Production(VP, [V, NP, PP]),
        cfg.Production(NP, [Det, N, PP]),
        cfg.Production(PP, [P, NP]),

        # Lexical Productions
        cfg.Production(NP, ["I"]),
        cfg.Production(Det, ["the"]),
        cfg.Production(Det, ["a"]),
        cfg.Production(N, ["man"]),
        cfg.Production(V, ["saw"]),
        cfg.Production(P, ["in"]),
        cfg.Production(P, ["with"]),
        cfg.Production(N, ["park"]),
        cfg.Production(N, ["dog"]),
        cfg.Production(N, ["telescope"]),
    )
    grammar = cfg.Grammar(S, productions)

    # Tokenize a sample sentence.
    sent = list(tokenize.whitespace("I saw a man in the park"))

    # Define a parser (ShiftReduce is assumed to be defined in the enclosing module).
    parser = ShiftReduce(grammar)
    parser.trace()
    for p in parser.get_parse_list(sent):
        print(p)
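# A comparison sketch (an addition, not in the original source): both demo parsers expose
# the same get_parse_list(tokens) interface, so they can be run over the same grammar and
# sentence built in the two demos above.  RecursiveDescent and ShiftReduce are assumed to
# be in scope (they are the classes the demos instantiate).  The plain shift-reduce
# strategy returns at most one parse and may miss parses the recursive-descent parser finds.
def compare_parsers(grammar, sent):
    for parser in (RecursiveDescent(grammar), ShiftReduce(grammar)):
        parses = parser.get_parse_list(sent)
        print(parser.__class__.__name__, 'found', len(parses), 'parse(s)')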
def induce(start, productions):
    pcount = {}  # Production count: the number of times a given production occurs
    lcount = {}  # LHS count: the number of times a given lhs occurs

    for prod in productions:
        lcount[prod.lhs()] = lcount.get(prod.lhs(), 0) + 1
        pcount[prod] = pcount.get(prod, 0) + 1

    # Relative-frequency estimate: P(lhs -> rhs) = count(lhs -> rhs) / count(lhs)
    prods = [Production(p.lhs(), p.rhs(), prob=float(pcount[p]) / lcount[p.lhs()])
             for p in pcount]
    return Grammar(start, prods)


#################################################################
# Toy PCFGs
#################################################################

_S, _VP, _NP, _PP = cfg.nonterminals('S, VP, NP, PP')
_V, _N, _P, _Name, _Det = cfg.nonterminals('V, N, P, Name, Det')

toy1 = Grammar(_S, [
    Production(_NP, [_Det, _N], prob=0.5),
    Production(_NP, [_NP, _PP], prob=0.25),
    Production(_NP, ['John'], prob=0.1),
    Production(_NP, ['I'], prob=0.15),
    Production(_Det, ['the'], prob=0.8),
    Production(_Det, ['my'], prob=0.2),
    Production(_N, ['dog'], prob=0.5),
    Production(_N, ['cookie'], prob=0.5),
    Production(_VP, [_VP, _PP], prob=0.1),
    Production(_VP, [_V, _NP], prob=0.7),
    Production(_VP, [_V], prob=0.2),
    Production(_V, ['ate'], prob=0.35),
    # (the remaining toy1 productions are truncated in this fragment)
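# A worked example (an addition, not in the original source) of the relative-frequency
# estimate used by induce() above: each production A -> beta receives probability
# count(A -> beta) / count(A).  With three observed NP expansions, two of them NP -> Det N,
# induce() should assign NP -> Det N probability 2/3 and NP -> NP PP probability 1/3.
def _induce_example():
    observed = [
        cfg.Production(_NP, [_Det, _N]),
        cfg.Production(_NP, [_Det, _N]),
        cfg.Production(_NP, [_NP, _PP]),
    ]
    print(induce(_S, observed))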
def parse(self, p_string):
    """
    Parses a string and stores the resulting hierarchy of
    "domains", "hierarchies" and "tables".

    For the sake of NLP the string is parsed using the nltk_lite
    context free grammar library.

    A query is a "sentence" and can be a domain, a hierarchy or a table.
    A domain is simply a word.
    A hierarchy is expressed as "domain/domain".
    A table is expressed as "table(sentence, sentence, sentence)".

    Internally the query is represented as an nltk_lite.parse.tree.

    Process:
    1. The string is tokenized.
    2. A context free grammar is built.
    3. The token list is parsed.
    4. The parse is converted to a tree representation.
    """
    self.nltktree = None

    # Store the query string
    self.string = p_string

    """
    1. Tokenize
    ------------------------------------------------------------------------
    """

    # Tokenize the query string, allowing only strings, parentheses,
    # forward slashes and commas.
    re_all = r'table[(]|\,|[)]|[/]|\w+'
    data_tokens = tokenize.regexp(self.string, re_all)

    """
    2. Develop a context free grammar
    ------------------------------------------------------------------------
    """

    # Develop a context free grammar
    # O = operator (a whole sentence), T = table, H = hierarchy, D = domain
    O, T, H, D = cfg.nonterminals('O, T, H, D')

    # Specify the grammar
    productions = (
        # A sentence can be either a table, a hierarchy or a domain
        cfg.Production(O, [D]),
        cfg.Production(O, [H]),
        cfg.Production(O, [T]),

        # A table must be the following sequence:
        # "table(", sentence, comma, sentence, comma, sentence, ")"
        cfg.Production(T, ['table(', O, ',', O, ',', O, ')']),

        # A hierarchy must be the following sequence:
        # domain, forward slash, domain
        cfg.Production(H, [D, '/', D]),
        # domain, forward slash, another operator
        cfg.Production(H, [D, '/', O])
    )

    # Add domains to the cfg productions.
    # A domain is a token that is entirely word chars.
    re_domain = compile(r'^\w+$')
    # Try every token and add it if it matches the above regular expression.
    for tok in data_tokens:
        if re_domain.match(tok):
            # The trailing comma wraps the production in a one-element tuple
            # so it can be concatenated onto the productions tuple.
            prod = cfg.Production(D, [tok]),
            productions = productions + prod

    # Make a grammar out of our productions
    grammar = cfg.Grammar(O, productions)
    rd_parser = parse.RecursiveDescent(grammar)

    # The token stream returned by tokenize.regexp is a one-shot iterator:
    # it was exhausted by the loop above, so it has to be re-created here.
    tokens = tokenize.regexp(self.string, re_all)
    toklist = list(tokens)

    """
    3. Parse using the context free grammar
    ------------------------------------------------------------------------
    """

    # Store the parse.
    # Only the first one is kept, as the grammar should be unambiguous.
    try:
        self.parseList = rd_parser.get_parse_list(toklist)[0]
    except IndexError:
        print('Could not parse query.')
        return

    """
    4. Refine and convert to a Tree representation
    ------------------------------------------------------------------------
    """

    # Set the nltk_lite.parse.tree tree for this query to the global sentence
    string = str(self.parseList)
    string2 = (string.replace(':', '')
                     .replace("')'", '')
                     .replace('table(', '')
                     .replace("','", '')
                     .replace("'", '')
                     .replace('/', ''))
    self.nltktree = parse.tree.bracket_parse(string2)

    # Store the resulting nltk_lite.parse.tree tree
    self.parseTree = QuerySentence(self.nltktree)
    self.xml = self.parseTree.toXML()
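# A usage sketch (an addition, not in the original source): the three query shapes that
# parse() accepts, run through the same token pattern it uses.  tokenize.regexp and the
# pattern string are taken from parse() above and assumed to be in scope; the example
# query strings and the helper name are made up for illustration.
def show_query_tokens():
    re_all = r'table[(]|\,|[)]|[/]|\w+'
    for query in ('region', 'region/city', 'table(region/city, year, sales)'):
        print(query, '->', list(tokenize.regexp(query, re_all)))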