def demo3():
    """Interactive demo: display a toy CFG's productions in a Tk window.

    Pressing 'q' closes the window.  Two productions (NP -> NP PP and
    PP -> 'up' 'over' NP) start out marked.
    """
    from nltk import Production

    # Declare the nonterminal symbols used by the toy grammar.
    (S, VP, NP, PP, P, N, Name, V, Det) = nonterminals(
        'S, VP, NP, PP, P, N, Name, V, Det'
    )

    syntactic = (
        Production(S, [NP, VP]),
        Production(NP, [Det, N]),
        Production(NP, [NP, PP]),
        Production(VP, [VP, PP]),
        Production(VP, [V, NP, PP]),
        Production(VP, [V, NP]),
        Production(PP, [P, NP]),
        Production(PP, []),
        Production(PP, ['up', 'over', NP]),
    )
    lexical = (
        Production(NP, ['I']),
        Production(Det, ['the']),
        Production(Det, ['a']),
        Production(N, ['man']),
        Production(V, ['saw']),
        Production(P, ['in']),
        Production(P, ['with']),
        Production(N, ['park']),
        Production(N, ['dog']),
        Production(N, ['statue']),
        Production(Det, ['my']),
    )
    # Order matters: indices 2 and 8 are marked below.
    productions = syntactic + lexical

    top = Tk()

    # Close the window when 'q' is pressed.
    def quit_demo(event, top=top):
        top.destroy()

    top.bind('q', quit_demo)

    plist = ProductionList(top, productions)
    plist.pack(expand=1, fill='both')
    plist.add_callback('select', plist.markonly)
    plist.add_callback('move', plist.markonly)
    plist.focus()

    # Pre-mark NP -> NP PP and PP -> 'up' 'over' NP.
    plist.mark(productions[2])
    plist.mark(productions[8])
def demo3():
    """Demonstrate ProductionList by showing a sample CFG in a Tk window.

    Binding: pressing "q" destroys the window.  The productions at
    indices 2 and 8 are marked at startup.
    """
    from nltk import Production

    S, VP, NP, PP, P, N, Name, V, Det = nonterminals(
        "S, VP, NP, PP, P, N, Name, V, Det"
    )

    # Keep this ordering fixed: indices 2 and 8 are marked below.
    productions = (
        # Syntactic productions.
        Production(S, [NP, VP]),
        Production(NP, [Det, N]),
        Production(NP, [NP, PP]),
        Production(VP, [VP, PP]),
        Production(VP, [V, NP, PP]),
        Production(VP, [V, NP]),
        Production(PP, [P, NP]),
        Production(PP, []),
        Production(PP, ["up", "over", NP]),
        # Lexical productions.
        Production(NP, ["I"]),
        Production(Det, ["the"]),
        Production(Det, ["a"]),
        Production(N, ["man"]),
        Production(V, ["saw"]),
        Production(P, ["in"]),
        Production(P, ["with"]),
        Production(N, ["park"]),
        Production(N, ["dog"]),
        Production(N, ["statue"]),
        Production(Det, ["my"]),
    )

    root = Tk()
    # "q" tears down the whole window.
    root.bind("q", lambda event: root.destroy())

    listbox = ProductionList(root, productions)
    listbox.pack(expand=1, fill="both")
    for evt in ("select", "move"):
        listbox.add_callback(evt, listbox.markonly)
    listbox.focus()

    listbox.mark(productions[2])
    listbox.mark(productions[8])
def demo3():
    """Show a sample grammar's productions in an interactive Tk list.

    The window closes on the "q" key; two productions begin marked.
    """
    from nltk import Production

    (S, VP, NP, PP, P, N, Name, V, Det) = nonterminals("S, VP, NP, PP, P, N, Name, V, Det")

    # (lhs, rhs) pairs; order is significant — entries 2 and 8 get marked.
    specs = [
        # Syntactic rules.
        (S, [NP, VP]),
        (NP, [Det, N]),
        (NP, [NP, PP]),
        (VP, [VP, PP]),
        (VP, [V, NP, PP]),
        (VP, [V, NP]),
        (PP, [P, NP]),
        (PP, []),
        (PP, ["up", "over", NP]),
        # Lexical rules.
        (NP, ["I"]),
        (Det, ["the"]),
        (Det, ["a"]),
        (N, ["man"]),
        (V, ["saw"]),
        (P, ["in"]),
        (P, ["with"]),
        (N, ["park"]),
        (N, ["dog"]),
        (N, ["statue"]),
        (Det, ["my"]),
    ]
    productions = tuple(Production(lhs, rhs) for lhs, rhs in specs)

    window = Tk()

    def on_quit(event, window=window):
        # Destroy the demo window on "q".
        window.destroy()

    window.bind("q", on_quit)

    view = ProductionList(window, productions)
    view.pack(expand=1, fill="both")
    view.add_callback("select", view.markonly)
    view.add_callback("move", view.markonly)
    view.focus()

    view.mark(productions[2])
    view.mark(productions[8])
def parse(self, p_string):
    """
    Parses a string and stores the resulting hierarchy of
    "domains", "hierarchies" and "tables".

    For the sake of NLP I've parsed the string using the nltk
    context free grammar library.

    A query is a "sentence" and can either be a domain, hierarchy or a
    table.  A domain is simply a word.  A hierarchy is expressed as
    "domain/domain".  A table is expressed as
    "table(sentence, sentence, sentence)".

    Internally the query is represented as a nltk.parse.tree.

    Process:
    1. string is tokenized
    2. develop a context free grammar
    3. parse
    4. convert to a tree representation
    """
    self.nltktree = None

    # Store the query string.
    self.string = p_string

    # Tokenize the query string, allowing only strings, parentheses,
    # forward slashes and commas.  Materialize the tokens immediately:
    # regexp_tokenize may return a one-shot iterator, which is why the
    # old code had to tokenize a second time after the loop below
    # exhausted it.
    re_all = r'table[(]|\,|[)]|[/]|\w+'
    data_tokens = list(tokenize.regexp_tokenize(self.string, re_all))

    # Develop a context free grammar.
    # O = sentence (operator), T = table, H = hierarchy, D = domain
    O, T, H, D = nonterminals('O, T, H, D')

    # Specify the grammar productions.
    productions = (
        # A sentence can be either a table, hierarchy or domain.
        Production(O, [D]),
        Production(O, [H]),
        Production(O, [T]),
        # A table must be the following sequence:
        # "table(", sentence, comma, sentence, comma, sentence, ")"
        Production(T, ['table(', O, ',', O, ',', O, ')']),
        # A hierarchy must be the following sequence:
        # domain, forward slash, domain
        Production(H, [D, '/', D]),
        # domain, forward slash, another operator
        Production(H, [D, '/', O]),
    )

    # Add domains to the cfg productions.
    # A domain is a token that is entirely word chars.
    re_domain = compile(r'^\w+$')
    # Try every token and add it if it matches the regular expression.
    for tok in data_tokens:
        if re_domain.match(tok):
            # Concatenate a 1-tuple onto the productions tuple.
            productions = productions + (Production(D, [tok]),)

    # Make a grammar out of our productions.
    grammar = ContextFreeGrammar(O, productions)
    rd_parser = parse.RecursiveDescentParser(grammar)

    # Reuse the token list built above — no need to re-tokenize.
    toklist = list(data_tokens)

    # Store the parsing.  Only the first one, as the grammar should be
    # completely nonambiguous.
    try:
        self.parseList = rd_parser.get_parse_list(toklist)[0]
    except IndexError:
        # Parenthesized form works identically on Python 2 and 3.
        print("Could not parse query.")
        return

    # Set the nltk.parse.tree tree for this query to the global sentence.
    string = str(self.parseList)
    string2 = string.replace(":", "").replace("')'", "").replace(
        "table(", "").replace("','", "").replace("'", "").replace("/", "")
    self.nltktree = parse.tree.bracket_parse(string2)

    # Store the resulting nltk.parse.tree tree.
    self.parseTree = QuerySentence(self.nltktree)
    self.xml = self.parseTree.toXML()
def parse(self, p_string):
    """
    Parses a string and stores the resulting hierarchy of
    "domains", "hierarchies" and "tables".

    For the sake of NLP I've parsed the string using the nltk
    context free grammar library.

    A query is a "sentence" and can either be a domain, hierarchy or a
    table.  A domain is simply a word.  A hierarchy is expressed as
    "domain/domain".  A table is expressed as
    "table(sentence, sentence, sentence)".

    Internally the query is represented as a nltk.parse.tree.

    Process:
    1. string is tokenized
    2. develop a context free grammar
    3. parse
    4. convert to a tree representation
    """
    self.nltktree = None

    # Store the query string.
    self.string = p_string

    # Tokenize the query string, allowing only strings, parentheses,
    # forward slashes and commas.  The token stream is converted to a
    # list right away: regexp_tokenize may yield a single-use iterator,
    # which is why earlier revisions had to tokenize twice (the first
    # pass was consumed by the domain-collection loop below).
    re_all = r'table[(]|\,|[)]|[/]|\w+'
    data_tokens = list(tokenize.regexp_tokenize(self.string, re_all))

    # Develop a context free grammar.
    # O = sentence (operator), T = table, H = hierarchy, D = domain
    O, T, H, D = nonterminals('O, T, H, D')

    # Specify the grammar productions.
    productions = (
        # A sentence can be either a table, hierarchy or domain.
        Production(O, [D]),
        Production(O, [H]),
        Production(O, [T]),
        # A table must be the following sequence:
        # "table(", sentence, comma, sentence, comma, sentence, ")"
        Production(T, ['table(', O, ',', O, ',', O, ')']),
        # A hierarchy must be the following sequence:
        # domain, forward slash, domain
        Production(H, [D, '/', D]),
        # domain, forward slash, another operator
        Production(H, [D, '/', O]),
    )

    # Add domains to the cfg productions.
    # A domain is a token that is entirely word chars.
    re_domain = compile(r'^\w+$')
    # Every matching token becomes a D -> token lexical production.
    for tok in data_tokens:
        if re_domain.match(tok):
            # The trailing comma builds a 1-tuple for concatenation.
            productions = productions + (Production(D, [tok]),)

    # Make a grammar out of our productions.
    grammar = ContextFreeGrammar(O, productions)
    rd_parser = parse.RecursiveDescentParser(grammar)

    # Reuse the token list built above instead of re-tokenizing.
    toklist = list(data_tokens)

    # Store the parsing.  Only the first one, as the grammar should be
    # completely nonambiguous.
    try:
        self.parseList = rd_parser.get_parse_list(toklist)[0]
    except IndexError:
        # Parenthesized form behaves the same under Python 2 and 3.
        print("Could not parse query.")
        return

    # Set the nltk.parse.tree tree for this query to the global sentence.
    string = str(self.parseList)
    string2 = string.replace(":", "").replace("')'", "").replace(
        "table(", "").replace("','", "").replace("'", "").replace("/", "")
    self.nltktree = parse.tree.bracket_parse(string2)

    # Store the resulting nltk.parse.tree tree.
    self.parseTree = QuerySentence(self.nltktree)
    self.xml = self.parseTree.toXML()