def demo(): """ A demonstration that shows the output of several different tokenizers on the same string. """ from en.parser.nltk_lite import tokenize # Define the test string. s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks." print 'Input text:' print `s` print print 'Tokenize using whitespace:' _display(tokenize.whitespace(s)) print print 'Tokenize sequences of alphanumeric characters:' _display(tokenize.regexp(s, pattern=r'\w+', gaps=False)) print print 'Tokenize sequences of letters and sequences of nonletters:' _display(tokenize.wordpunct(s)) print print 'Tokenize by lines:' _display(tokenize.line(s)) print print 'Tokenize by blank lines:' _display(tokenize.blankline(s)) print print 'A simple sentence tokenizer:' _display(tokenize.regexp(s, pattern=r'\.(\s+|$)', gaps=True)) print
def demo(): """ A demonstration that shows the output of several different tokenizers on the same string. """ from en.parser.nltk_lite import tokenize # Define the test string. s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks." print 'Input text:' print ` s ` print print 'Tokenize using whitespace:' _display(tokenize.whitespace(s)) print print 'Tokenize sequences of alphanumeric characters:' _display(tokenize.regexp(s, pattern=r'\w+', gaps=False)) print print 'Tokenize sequences of letters and sequences of nonletters:' _display(tokenize.wordpunct(s)) print print 'Tokenize by lines:' _display(tokenize.line(s)) print print 'Tokenize by blank lines:' _display(tokenize.blankline(s)) print print 'A simple sentence tokenizer:' _display(tokenize.regexp(s, pattern=r'\.(\s+|$)', gaps=True)) print
def demo():
    import sys, time

    S = GrammarCategory.parse('S')
    VP = GrammarCategory.parse('VP')
    NP = GrammarCategory.parse('NP')
    PP = GrammarCategory.parse('PP')
    V = GrammarCategory.parse('V')
    N = GrammarCategory.parse('N')
    P = GrammarCategory.parse('P')
    Name = GrammarCategory.parse('Name')
    Det = GrammarCategory.parse('Det')
    DetSg = GrammarCategory.parse('Det[-pl]')
    DetPl = GrammarCategory.parse('Det[+pl]')
    NSg = GrammarCategory.parse('N[-pl]')
    NPl = GrammarCategory.parse('N[+pl]')

    # Define some grammatical productions.
    grammatical_productions = [
        cfg.Production(S, (NP, VP)),
        cfg.Production(PP, (P, NP)),
        cfg.Production(NP, (NP, PP)),
        cfg.Production(VP, (VP, PP)),
        cfg.Production(VP, (V, NP)),
        cfg.Production(VP, (V,)),
        cfg.Production(NP, (DetPl, NPl)),
        cfg.Production(NP, (DetSg, NSg)),
    ]

    # Define some lexical productions.
    lexical_productions = [
        cfg.Production(NP, ('John',)),
        cfg.Production(NP, ('I',)),
        cfg.Production(Det, ('the',)),
        cfg.Production(Det, ('my',)),
        cfg.Production(Det, ('a',)),
        cfg.Production(NSg, ('dog',)),
        cfg.Production(NSg, ('cookie',)),
        cfg.Production(V, ('ate',)),
        cfg.Production(V, ('saw',)),
        cfg.Production(P, ('with',)),
        cfg.Production(P, ('under',)),
    ]

    earley_grammar = cfg.Grammar(S, grammatical_productions)
    earley_lexicon = {}
    for prod in lexical_productions:
        earley_lexicon.setdefault(prod.rhs()[0].upper(), []).append(prod.lhs())

    sent = 'I saw John with a dog with my cookie'
    print("Sentence:\n", sent)
    from en.parser.nltk_lite import tokenize
    tokens = list(tokenize.whitespace(sent))
    t = time.time()
    cp = FeatureEarleyChartParse(earley_grammar, earley_lexicon, trace=1)
    trees = cp.get_parse_list(tokens)
    print("Time: %s" % (time.time() - t))
    for tree in trees:
        print(tree)

def text_parse(inputs, grammar, trace=0):
    """
    Convert input sentences into syntactic trees.
    """
    parses = {}
    for sent in inputs:
        tokens = list(tokenize.whitespace(sent))
        parser = grammar.earley_parser(trace=trace)
        syntrees = parser.get_parse_list(tokens)
        parses[sent] = syntrees
    return parses

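# Hedged usage sketch (added for illustration): how text_parse() above might be
# called.  GrammarFile.read_file() is the grammar loader used by main() elsewhere
# in these snippets; 'demo.cfg' is a hypothetical grammar file name.
def text_parse_example():
    grammar = GrammarFile.read_file('demo.cfg')
    parses = text_parse(['I saw John with a dog'], grammar, trace=0)
    for sent, trees in parses.items():
        print('%s -> %d parse(s)' % (sent, len(trees)))
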
def text_parse(grammar, sent, trace=2, drawtrees=False, latex=False):
    parser = grammar.earley_parser(trace=trace)
    print(parser._grammar)
    tokens = list(tokenize.whitespace(sent))
    trees = parser.get_parse_list(tokens)
    if drawtrees:
        from treeview import TreeView
        TreeView(trees)
    else:
        for tree in trees:
            if latex:
                print(tree.latex_qtree())
            else:
                print(tree)

def demo():
    from en.parser.nltk_lite import tokenize, stem

    # Create a simple regular-expression-based stemmer.
    stemmer = stem.Regexp('ing$|s$|e$', min=4)
    text = "John was eating icecream"
    tokens = tokenize.whitespace(text)

    # Print the results.
    print(stemmer)
    for word in tokens:
        print('%20s => %s' % (word, stemmer.stem(word)))
    print()

def raw(files='english-kjv'):
    """
    @param files: One or more genesis corpus files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{string}
    """
    # Just one file to process?  If so, convert to a tuple so we can iterate.
    if type(files) is str:
        files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "genesis", file + ".txt")
        s = open(path).read()
        for t in tokenize.whitespace(s):
            yield t

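# Hedged usage sketch (added for illustration): stream the first ten whitespace
# tokens of the King James Genesis text via raw() above.  Assumes the "genesis"
# corpus directory is installed under get_basedir().
def genesis_raw_example():
    from itertools import islice
    for token in islice(raw('english-kjv'), 10):
        print(token)
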
def raw(files='raw'):
    """
    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{list(string)}
    """
    # Just one file to process?  If so, convert to a tuple so we can iterate.
    if type(files) is str:
        files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "treebank", file)
        f = open(path).read()
        for sent in tokenize.blankline(f):
            l = []
            for t in tokenize.whitespace(sent):
                l.append(t)
            yield l

def demo(): """ Create a shift reduce parser demo, using a simple grammar and text. """ from en.parser.nltk_lite.parse import cfg nonterminals = 'S VP NP PP P N Name V Det' (S, VP, NP, PP, P, N, Name, V, Det) = [cfg.Nonterminal(s) for s in nonterminals.split()] productions = ( # Syntactic Productions cfg.Production(S, [NP, VP]), cfg.Production(NP, [Det, N]), cfg.Production(NP, [NP, PP]), cfg.Production(VP, [VP, PP]), cfg.Production(VP, [V, NP, PP]), cfg.Production(VP, [V, NP]), cfg.Production(PP, [P, NP]), # Lexical Productions cfg.Production(NP, ['I']), cfg.Production(Det, ['the']), cfg.Production(Det, ['a']), cfg.Production(N, ['man']), cfg.Production(V, ['saw']), cfg.Production(P, ['in']), cfg.Production(P, ['with']), cfg.Production(N, ['park']), cfg.Production(N, ['dog']), cfg.Production(N, ['statue']), cfg.Production(Det, ['my']), ) grammar = cfg.Grammar(S, productions) # tokenize the sentence sent = list( tokenize.whitespace('my dog saw a man in the park with a statue')) ShiftReduceDemo(grammar, sent).mainloop()
def demo(): """ A demonstration of the recursive descent parser. """ from en.parser.nltk_lite.parse import cfg # Define some nonterminals S, VP, NP, PP = cfg.nonterminals('S, VP, NP, PP') V, N, P, Name, Det = cfg.nonterminals('V, N, P, Name, Det') # Define a grammar. productions = ( # Syntactic Productions cfg.Production(S, [NP, 'saw', NP]), cfg.Production(S, [NP, VP]), cfg.Production(NP, [Det, N]), cfg.Production(VP, [V, NP, PP]), cfg.Production(NP, [Det, N, PP]), cfg.Production(PP, [P, NP]), # Lexical Productions cfg.Production(NP, ['I']), cfg.Production(Det, ['the']), cfg.Production(Det, ['a']), cfg.Production(N, ['man']), cfg.Production(V, ['saw']), cfg.Production(P, ['in']), cfg.Production(P, ['with']), cfg.Production(N, ['park']), cfg.Production(N, ['dog']), cfg.Production(N, ['telescope'])) grammar = cfg.Grammar(S, productions) # Tokenize a sample sentence. sent = list(tokenize.whitespace('I saw a man in the park')) # Define a list of parsers. parser = RecursiveDescent(grammar) parser.trace() for p in parser.get_parse_list(sent): print p
def demo(): """ A demonstration of the shift-reduce parser. """ from en.parser.nltk_lite.parse import cfg # Define some nonterminals S, VP, NP, PP = cfg.nonterminals("S, VP, NP, PP") V, N, P, Name, Det = cfg.nonterminals("V, N, P, Name, Det") # Define a grammar. productions = ( # Syntactic Productions cfg.Production(S, [NP, "saw", NP]), cfg.Production(S, [NP, VP]), cfg.Production(NP, [Det, N]), cfg.Production(VP, [V, NP, PP]), cfg.Production(NP, [Det, N, PP]), cfg.Production(PP, [P, NP]), # Lexical Productions cfg.Production(NP, ["I"]), cfg.Production(Det, ["the"]), cfg.Production(Det, ["a"]), cfg.Production(N, ["man"]), cfg.Production(V, ["saw"]), cfg.Production(P, ["in"]), cfg.Production(P, ["with"]), cfg.Production(N, ["park"]), cfg.Production(N, ["dog"]), cfg.Production(N, ["telescope"]), ) grammar = cfg.Grammar(S, productions) # Tokenize a sample sentence. sent = list(tokenize.whitespace("I saw a man in the park")) parser = ShiftReduce(grammar) parser.trace() for p in parser.get_parse_list(sent): print p
def demo(): """ Create a shift reduce parser demo, using a simple grammar and text. """ from en.parser.nltk_lite.parse import cfg nonterminals = "S VP NP PP P N Name V Det" (S, VP, NP, PP, P, N, Name, V, Det) = [cfg.Nonterminal(s) for s in nonterminals.split()] productions = ( # Syntactic Productions cfg.Production(S, [NP, VP]), cfg.Production(NP, [Det, N]), cfg.Production(NP, [NP, PP]), cfg.Production(VP, [VP, PP]), cfg.Production(VP, [V, NP, PP]), cfg.Production(VP, [V, NP]), cfg.Production(PP, [P, NP]), # Lexical Productions cfg.Production(NP, ["I"]), cfg.Production(Det, ["the"]), cfg.Production(Det, ["a"]), cfg.Production(N, ["man"]), cfg.Production(V, ["saw"]), cfg.Production(P, ["in"]), cfg.Production(P, ["with"]), cfg.Production(N, ["park"]), cfg.Production(N, ["dog"]), cfg.Production(N, ["statue"]), cfg.Production(Det, ["my"]), ) grammar = cfg.Grammar(S, productions) # tokenize the sentence sent = list(tokenize.whitespace("my dog saw a man in the park with a statue")) ShiftReduceDemo(grammar, sent).mainloop()
def demo(): """ A demonstration of the recursive descent parser. """ from en.parser.nltk_lite.parse import cfg # Define some nonterminals S, VP, NP, PP = cfg.nonterminals('S, VP, NP, PP') V, N, P, Name, Det = cfg.nonterminals('V, N, P, Name, Det') # Define a grammar. productions = ( # Syntactic Productions cfg.Production(S, [NP, 'saw', NP]), cfg.Production(S, [NP, VP]), cfg.Production(NP, [Det, N]), cfg.Production(VP, [V, NP, PP]), cfg.Production(NP, [Det, N, PP]), cfg.Production(PP, [P, NP]), # Lexical Productions cfg.Production(NP, ['I']), cfg.Production(Det, ['the']), cfg.Production(Det, ['a']), cfg.Production(N, ['man']), cfg.Production(V, ['saw']), cfg.Production(P, ['in']), cfg.Production(P, ['with']), cfg.Production(N, ['park']), cfg.Production(N, ['dog']), cfg.Production(N, ['telescope']) ) grammar = cfg.Grammar(S, productions) # Tokenize a sample sentence. sent = list(tokenize.whitespace('I saw a man in the park')) # Define a list of parsers. parser = RecursiveDescent(grammar) parser.trace() for p in parser.get_parse_list(sent): print p
def demo(): """ Create a recursive descent parser demo, using a simple grammar and text. """ from en.parser.nltk_lite.parse import cfg grammar = cfg.parse_grammar(""" # Grammatical productions. S -> NP VP NP -> Det N PP | Det N VP -> V NP PP | V NP | V PP -> P NP # Lexical productions. NP -> 'I' Det -> 'the' | 'a' N -> 'man' | 'park' | 'dog' | 'telescope' V -> 'ate' | 'saw' P -> 'in' | 'under' | 'with' """) sent = list(tokenize.whitespace('the dog saw a man in the park')) RecursiveDescentDemo(grammar, sent).mainloop()
def set_sentence(self, sentence):
    self._sent = list(tokenize.whitespace(sentence))  # [XX] use tagged?
    self.reset()

def demo(): """ A demonstration of the probabilistic parsers. The user is prompted to select which demo to run, and how many parses should be found; and then each parser is run on the same demo, and a summary of the results are displayed. """ import sys, time from en.parser.nltk_lite import tokenize from en.parser.nltk_lite.parse import cfg, pcfg, ViterbiParse # Define two demos. Each demo has a sentence and a grammar. demos = [('I saw John with my cookie', pcfg.toy1), ('the boy saw Jack with Bob under the table with a telescope', pcfg.toy2)] # Ask the user which demo they want to use. print for i in range(len(demos)): print '%3s: %s' % (i + 1, demos[i][0]) print ' %r' % demos[i][1] print print 'Which demo (%d-%d)? ' % (1, len(demos)), try: snum = int(sys.stdin.readline().strip()) - 1 sent, grammar = demos[snum] except: print 'Bad sentence number' return # Tokenize the sentence. tokens = list(tokenize.whitespace(sent)) parser = ViterbiParse(grammar) all_parses = {} print '\nsent: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar) parser.trace(3) t = time.time() parses = parser.get_parse_list(tokens) time = time.time() - t if parses: average = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) else: average = 0 num_parses = len(parses) for p in parses: all_parses[p.freeze()] = 1 # Print some summary statistics print print 'Time (secs) # Parses Average P(parse)' print '-----------------------------------------' print '%11.4f%11d%19.14f' % (time, num_parses, average) parses = all_parses.keys() if parses: p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) else: p = 0 print '------------------------------------------' print '%11s%11d%19.14f' % ('n/a', len(parses), p) # Ask the user if we should draw the parses. print print 'Draw parses (y/n)? ', if sys.stdin.readline().strip().lower().startswith('y'): from en.parser.nltk_lite.draw.tree import draw_trees print ' please wait...' draw_trees(*parses) # Ask the user if we should print the parses. print print 'Print parses (y/n)? ', if sys.stdin.readline().strip().lower().startswith('y'): for parse in parses: print parse
def demo(): """ A demonstration of the probabilistic parsers. The user is prompted to select which demo to run, and how many parses should be found; and then each parser is run on the same demo, and a summary of the results are displayed. """ import sys, time from en.parser.nltk_lite import tokenize from en.parser.nltk_lite.parse import cfg, pcfg, pchart # Define two demos. Each demo has a sentence and a grammar. demos = [('I saw John with my cookie', pcfg.toy1), ('the boy saw Jack with Bob under the table with a telescope', pcfg.toy2)] # Ask the user which demo they want to use. print for i in range(len(demos)): print '%3s: %s' % (i+1, demos[i][0]) print ' %r' % demos[i][1] print print 'Which demo (%d-%d)? ' % (1, len(demos)), try: snum = int(sys.stdin.readline().strip())-1 sent, grammar = demos[snum] except: print 'Bad sentence number' return # Tokenize the sentence. tokens = list(tokenize.whitespace(sent)) # Define a list of parsers. We'll use all parsers. parsers = [ pchart.InsideParse(grammar), pchart.RandomParse(grammar), pchart.UnsortedParse(grammar), pchart.LongestParse(grammar), pchart.BeamParse(len(tokens)+1, grammar) ] # Run the parsers on the tokenized sentence. times = [] average_p = [] num_parses = [] all_parses = {} for parser in parsers: print '\ns: %s\nparser: %s\ngrammar: %s' % (sent,parser,pcfg) parser.trace(3) t = time.time() parses = parser.get_parse_list(tokens) times.append(time.time()-t) if parses: p = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses) else: p = 0 average_p.append(p) num_parses.append(len(parses)) for p in parses: all_parses[p.freeze()] = 1 # Print some summary statistics print print ' Parser | Time (secs) # Parses Average P(parse)' print '-------------------+------------------------------------------' for i in range(len(parsers)): print '%18s |%11.4f%11d%19.14f' % (parsers[i].__class__.__name__, times[i],num_parses[i],average_p[i]) parses = all_parses.keys() if parses: p = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses) else: p = 0 print '-------------------+------------------------------------------' print '%18s |%11s%11d%19.14f' % ('(All Parses)', 'n/a', len(parses), p) # Ask the user if we should draw the parses. print print 'Draw parses (y/n)? ', if sys.stdin.readline().strip().lower().startswith('y'): from en.parser.nltk_lite.draw.tree import draw_trees print ' please wait...' draw_trees(*parses) # Ask the user if we should print the parses. print print 'Print parses (y/n)? ', if sys.stdin.readline().strip().lower().startswith('y'): for parse in parses: print parse
def _chunk_parse(files, chunk_types, top_node, partial_match,
                 collapse_partials, cascade):
    # Allow any kind of bracketing for flexibility.
    L_BRACKET = re.compile(r'[\(\[\{<]')
    R_BRACKET = re.compile(r'[\)\]\}>]')

    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "ycoe/psd", file + ".psd")
        s = open(path).read()
        data = _parse(s)
        for s in data:
            bracket = 0
            itmType = None
            stack = [tree.Tree(top_node, [])]
            inTag = []
            for itm in list(tokenize.whitespace(s)):
                if L_BRACKET.match(itm[0]):
                    bracket += 1
                    itm = itm[1:]
                    matched = False
                    if partial_match == True:
                        for eachItm in chunk_types:
                            if (len(eachItm) <= len(itm) and
                                    eachItm == itm[:len(eachItm)]):
                                matched = True
                                if collapse_partials == True:
                                    itm = eachItm
                    else:
                        if (chunk_types is not None and itm in chunk_types):
                            matched = True
                    if matched == True:  # and inTag == 0:
                        chunk = tree.Tree(itm, [])
                        if cascade == True:
                            stack.append(chunk)
                            inTag += [bracket]
                        else:
                            if len(inTag) == 0:
                                stack[-1].append(chunk)
                                inTag += [bracket]
                    itmType = itm
                if R_BRACKET.match(itm[-1]):
                    # str.split; equivalent to the old string.split(itm, itm[-1]).
                    tmpItm = itm.split(itm[-1])
                    if tmpItm != "":
                        if len(inTag) > 0 and inTag[-1] <= bracket:  # inTag <= bracket:
                            if cascade == True:
                                stack[-1].append((itmType, tmpItm[0]))
                            else:
                                stack[-1][-1].append((itmType, tmpItm[0]))
                        else:
                            if cascade == True:
                                if len(stack) > 1:
                                    stack[-2].append(stack[-1])
                                    stack = stack[:-1]
                            stack[-1].append((itmType, tmpItm[0]))
                            inTag = [] + inTag[:-2]
                    bracket -= (len(tmpItm) - 1)
                    while (len(inTag) > 0 and bracket < inTag[-1]):
                        if cascade == True:
                            if len(stack) > 1:
                                stack[-2].append(stack[-1])
                                stack = stack[:-1]
                        inTag = [] + inTag[:-2]
            yield stack

def _list_sent(sent):
    return [tokenize.whitespace(line) for line in tokenize.line(sent)]

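# Hedged usage sketch (added for illustration): _list_sent() above turns each
# input line into one whitespace-token iterator; wrap it in list() to
# materialise the tokens.
def _list_sent_example():
    block = "colorless green ideas\nsleep furiously"
    for line_tokens in _list_sent(block):
        print(list(line_tokens))
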
def main():
    import sys
    from optparse import OptionParser, OptionGroup

    usage = """%%prog [options] [grammar_file]""" % globals()

    opts = OptionParser(usage=usage)
    opts.add_option("-c", "--components",
                    action="store_true", dest="show_components", default=0,
                    help="show hole semantics components")
    opts.add_option("-r", "--raw",
                    action="store_true", dest="show_raw", default=0,
                    help="show the raw hole semantics expression")
    opts.add_option("-d", "--drawtrees",
                    action="store_true", dest="draw_trees", default=0,
                    help="show formula trees in a GUI window")
    opts.add_option("-v", "--verbose",
                    action="count", dest="verbosity", default=0,
                    help="show more information during parse")

    (options, args) = opts.parse_args()

    if len(args) > 0:
        filename = args[0]
    else:
        filename = 'hole.cfg'

    print('Reading grammar file', filename)
    grammar = GrammarFile.read_file(filename)
    parser = grammar.earley_parser(trace=options.verbosity)

    # Prompt the user for a sentence.
    print('Sentence: ', end='')
    line = sys.stdin.readline()[:-1]

    # Parse the sentence.
    tokens = list(tokenize.whitespace(line))
    trees = parser.get_parse_list(tokens)
    print('Got %d different parses' % len(trees))

    for tree in trees:
        # Get the semantic feature from the top of the parse tree.
        sem = tree[0].node['sem'].simplify()

        # Skolemise away all quantifiers.  All variables become unique.
        sem = sem.skolemise()

        # Reparse the semantic representation from its bracketed string format.
        # I find this uniform structure easier to handle.  It also makes the
        # code mostly independent of the lambda calculus classes.
        usr = bracket_parse(str(sem))

        # Break the hole semantics representation down into its components,
        # i.e. holes, labels, formula fragments and constraints.
        hole_sem = HoleSemantics(usr)

        # Maybe print the raw semantic representation.
        if options.show_raw:
            print()
            print('Raw expression')
            print(usr)

        # Maybe show the details of the semantic representation.
        if options.show_components:
            print()
            print('Holes:       ', hole_sem.holes)
            print('Labels:      ', hole_sem.labels)
            print('Constraints: ', hole_sem.constraints)
            print('Top hole:    ', hole_sem.top_hole)
            print('Top labels:  ', hole_sem.top_most_labels)
            print('Fragments:')
            for (l, f) in hole_sem.fragments.items():
                print('\t%s: %s' % (l, f))

        # Find all the possible ways to plug the formulas together.
        pluggings = hole_sem.pluggings()

        # Build FOL formula trees using the pluggings.  (Materialise the map so
        # the trees can be both printed and drawn.)
        trees = list(map(hole_sem.formula_tree, pluggings))

        # Print out the formulas in a textual format.
        n = 1
        for tree in trees:
            print()
            print('%d. %s' % (n, tree))
            n += 1

        # Maybe draw the formulas as trees.
        if options.draw_trees:
            draw_trees(*trees)

    print()
    print('Done.')

def demo(): """ A demonstration of the probabilistic parsers. The user is prompted to select which demo to run, and how many parses should be found; and then each parser is run on the same demo, and a summary of the results are displayed. """ import sys, time from en.parser.nltk_lite import tokenize from en.parser.nltk_lite.parse import cfg, pcfg, pchart # Define two demos. Each demo has a sentence and a grammar. demos = [('I saw John with my cookie', pcfg.toy1), ('the boy saw Jack with Bob under the table with a telescope', pcfg.toy2)] # Ask the user which demo they want to use. print for i in range(len(demos)): print '%3s: %s' % (i + 1, demos[i][0]) print ' %r' % demos[i][1] print print 'Which demo (%d-%d)? ' % (1, len(demos)), try: snum = int(sys.stdin.readline().strip()) - 1 sent, grammar = demos[snum] except: print 'Bad sentence number' return # Tokenize the sentence. tokens = list(tokenize.whitespace(sent)) # Define a list of parsers. We'll use all parsers. parsers = [ pchart.InsideParse(grammar), pchart.RandomParse(grammar), pchart.UnsortedParse(grammar), pchart.LongestParse(grammar), pchart.BeamParse(len(tokens) + 1, grammar) ] # Run the parsers on the tokenized sentence. times = [] average_p = [] num_parses = [] all_parses = {} for parser in parsers: print '\ns: %s\nparser: %s\ngrammar: %s' % (sent, parser, pcfg) parser.trace(3) t = time.time() parses = parser.get_parse_list(tokens) times.append(time.time() - t) if parses: p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) else: p = 0 average_p.append(p) num_parses.append(len(parses)) for p in parses: all_parses[p.freeze()] = 1 # Print some summary statistics print print ' Parser | Time (secs) # Parses Average P(parse)' print '-------------------+------------------------------------------' for i in range(len(parsers)): print '%18s |%11.4f%11d%19.14f' % (parsers[i].__class__.__name__, times[i], num_parses[i], average_p[i]) parses = all_parses.keys() if parses: p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) else: p = 0 print '-------------------+------------------------------------------' print '%18s |%11s%11d%19.14f' % ('(All Parses)', 'n/a', len(parses), p) # Ask the user if we should draw the parses. print print 'Draw parses (y/n)? ', if sys.stdin.readline().strip().lower().startswith('y'): from en.parser.nltk_lite.draw.tree import draw_trees print ' please wait...' draw_trees(*parses) # Ask the user if we should print the parses. print print 'Print parses (y/n)? ', if sys.stdin.readline().strip().lower().startswith('y'): for parse in parses: print parse
def string2words(s, sep='/'):
    return [tag2tuple(t, sep)[0] for t in tokenize.whitespace(s)]

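# Hedged usage sketch (added for illustration) for string2words() above,
# assuming tag2tuple() splits each 'word/TAG' token on the separator and
# returns a (word, tag) pair.
def string2words_example():
    tagged = "John/NP saw/VBD the/AT dog/NN"
    print(string2words(tagged))  # expected: ['John', 'saw', 'the', 'dog']
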
def demo(): """ A demonstration of the probabilistic parsers. The user is prompted to select which demo to run, and how many parses should be found; and then each parser is run on the same demo, and a summary of the results are displayed. """ import sys, time from en.parser.nltk_lite import tokenize from en.parser.nltk_lite.parse import cfg, pcfg, ViterbiParse # Define two demos. Each demo has a sentence and a grammar. demos = [ ("I saw John with my cookie", pcfg.toy1), ("the boy saw Jack with Bob under the table with a telescope", pcfg.toy2), ] # Ask the user which demo they want to use. print for i in range(len(demos)): print "%3s: %s" % (i + 1, demos[i][0]) print " %r" % demos[i][1] print print "Which demo (%d-%d)? " % (1, len(demos)), try: snum = int(sys.stdin.readline().strip()) - 1 sent, grammar = demos[snum] except: print "Bad sentence number" return # Tokenize the sentence. tokens = list(tokenize.whitespace(sent)) parser = ViterbiParse(grammar) all_parses = {} print "\nsent: %s\nparser: %s\ngrammar: %s" % (sent, parser, grammar) parser.trace(3) t = time.time() parses = parser.get_parse_list(tokens) time = time.time() - t if parses: average = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) else: average = 0 num_parses = len(parses) for p in parses: all_parses[p.freeze()] = 1 # Print some summary statistics print print "Time (secs) # Parses Average P(parse)" print "-----------------------------------------" print "%11.4f%11d%19.14f" % (time, num_parses, average) parses = all_parses.keys() if parses: p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) else: p = 0 print "------------------------------------------" print "%11s%11d%19.14f" % ("n/a", len(parses), p) # Ask the user if we should draw the parses. print print "Draw parses (y/n)? ", if sys.stdin.readline().strip().lower().startswith("y"): from en.parser.nltk_lite.draw.tree import draw_trees print " please wait..." draw_trees(*parses) # Ask the user if we should print the parses. print print "Print parses (y/n)? ", if sys.stdin.readline().strip().lower().startswith("y"): for parse in parses: print parse