Example #1
def demo():
    """
    A demonstration that shows the output of several different
    tokenizers on the same string.
    """

    from en.parser.nltk_lite import tokenize

    # Define the test string.
    s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
    print 'Input text:'
    print `s`
    print
    print 'Tokenize using whitespace:'
    _display(tokenize.whitespace(s))
    print
    print 'Tokenize sequences of alphanumeric characters:'
    _display(tokenize.regexp(s, pattern=r'\w+', gaps=False))
    print
    print 'Tokenize sequences of letters and sequences of nonletters:'
    _display(tokenize.wordpunct(s))
    print
    print 'Tokenize by lines:'
    _display(tokenize.line(s))
    print
    print 'Tokenize by blank lines:'
    _display(tokenize.blankline(s))
    print
    print 'A simple sentence tokenizer:'
    _display(tokenize.regexp(s, pattern=r'\.(\s+|$)', gaps=True))
    print
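Note: the _display helper is not shown in this snippet. A minimal stand-in, assuming it only needs to materialize the token iterator and print it, could be:

def _display(tokens):
    # Hypothetical stand-in for the helper used above: the nltk_lite
    # tokenizers return iterators, so turn them into a list before printing.
    print(list(tokens))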
Example #2
def demo():
    """
    A demonstration that shows the output of several different
    tokenizers on the same string.
    """

    from en.parser.nltk_lite import tokenize

    # Define the test string.
    s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
    print 'Input text:'
    print `s`
    print
    print 'Tokenize using whitespace:'
    _display(tokenize.whitespace(s))
    print
    print 'Tokenize sequences of alphanumeric characters:'
    _display(tokenize.regexp(s, pattern=r'\w+', gaps=False))
    print
    print 'Tokenize sequences of letters and sequences of nonletters:'
    _display(tokenize.wordpunct(s))
    print
    print 'Tokenize by lines:'
    _display(tokenize.line(s))
    print
    print 'Tokenize by blank lines:'
    _display(tokenize.blankline(s))
    print
    print 'A simple sentence tokenizer:'
    _display(tokenize.regexp(s, pattern=r'\.(\s+|$)', gaps=True))
    print
Example #3
def demo():
    import sys, time

    S = GrammarCategory.parse('S')
    VP = GrammarCategory.parse('VP')
    NP = GrammarCategory.parse('NP')
    PP = GrammarCategory.parse('PP')
    V = GrammarCategory.parse('V')
    N = GrammarCategory.parse('N')
    P = GrammarCategory.parse('P')
    Name = GrammarCategory.parse('Name')
    Det = GrammarCategory.parse('Det')
    DetSg = GrammarCategory.parse('Det[-pl]')
    DetPl = GrammarCategory.parse('Det[+pl]')
    NSg = GrammarCategory.parse('N[-pl]')
    NPl = GrammarCategory.parse('N[+pl]')

    # Define some grammatical productions.
    grammatical_productions = [
        cfg.Production(S, (NP, VP)),
        cfg.Production(PP, (P, NP)),
        cfg.Production(NP, (NP, PP)),
        cfg.Production(VP, (VP, PP)),
        cfg.Production(VP, (V, NP)),
        cfg.Production(VP, (V, )),
        cfg.Production(NP, (DetPl, NPl)),
        cfg.Production(NP, (DetSg, NSg))
    ]

    # Define some lexical productions.
    lexical_productions = [
        cfg.Production(NP, ('John', )),
        cfg.Production(NP, ('I', )),
        cfg.Production(Det, ('the', )),
        cfg.Production(Det, ('my', )),
        cfg.Production(Det, ('a', )),
        cfg.Production(NSg, ('dog', )),
        cfg.Production(NSg, ('cookie', )),
        cfg.Production(V, ('ate', )),
        cfg.Production(V, ('saw', )),
        cfg.Production(P, ('with', )),
        cfg.Production(P, ('under', )),
    ]

    earley_grammar = cfg.Grammar(S, grammatical_productions)
    earley_lexicon = {}
    for prod in lexical_productions:
        earley_lexicon.setdefault(prod.rhs()[0].upper(), []).append(prod.lhs())

    sent = 'I saw John with a dog with my cookie'
    print("Sentence:\n", sent)
    from en.parser.nltk_lite import tokenize
    tokens = list(tokenize.whitespace(sent))
    t = time.time()
    cp = FeatureEarleyChartParse(earley_grammar, earley_lexicon, trace=1)
    trees = cp.get_parse_list(tokens)
    print("Time: %s" % (time.time() - t))
    for tree in trees:
        print(tree)
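The lexicon built here keys each entry by the upper-cased surface form. A small lookup helper consistent with that convention (a sketch, not part of the original code) would be:

def lexicon_lookup(word, lexicon):
    # Return the candidate categories for a surface form, using the same
    # upper-cased keys that the loop above stores in earley_lexicon.
    return lexicon.get(word.upper(), [])

# e.g. lexicon_lookup('dog', earley_lexicon) -> [N[-pl]]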
Example #4
def text_parse(inputs, grammar, trace=0):
    """
    Convert input sentences into syntactic trees.
    """
    parses = {}
    for sent in inputs:
        tokens = list(tokenize.whitespace(sent))
        parser = grammar.earley_parser(trace=trace)
        syntrees = parser.get_parse_list(tokens)
        parses[sent] = syntrees
    return parses
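A possible way to call text_parse, assuming grammar is a GrammarFile-style object that exposes earley_parser() as used above:

# Usage sketch (hypothetical sentences; 'grammar' must provide earley_parser()).
sentences = ['I saw John', 'John saw a dog']
for sent, syntrees in text_parse(sentences, grammar).items():
    print('%s -> %d parse(s)' % (sent, len(syntrees)))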
Example #5
def text_parse(inputs, grammar, trace=0):
    """
    Convert input sentences into syntactic trees.
    """
    parses = {}
    for sent in inputs:
        tokens = list(tokenize.whitespace(sent))
        parser = grammar.earley_parser(trace=trace)
        syntrees = parser.get_parse_list(tokens)
        parses[sent] = syntrees
    return parses
Example #6
def text_parse(grammar, sent, trace=2, drawtrees=False, latex=False):
    parser = grammar.earley_parser(trace=trace)
    print(parser._grammar)
    tokens = list(tokenize.whitespace(sent))
    trees = parser.get_parse_list(tokens)
    if drawtrees:
        from treeview import TreeView
        TreeView(trees)
    else:
        for tree in trees:
            if latex: print(tree.latex_qtree())
            else: print(tree)
Example #7
def demo():
    from en.parser.nltk_lite import tokenize, stem

    # Create a simple regular expression based stemmer
    stemmer = stem.Regexp('ing$|s$|e$', min=4)
    text = "John was eating icecream"
    tokens = tokenize.whitespace(text)

    # Print the results.
    print stemmer
    for word in tokens:
        print '%20s => %s' % (word, stemmer.stem(word))
    print
Example #8
def demo():
    from en.parser.nltk_lite import tokenize, stem

    # Create a simple regular expression based stemmer
    stemmer = stem.Regexp('ing$|s$|e$', min=4)
    text = "John was eating icecream"
    tokens = tokenize.whitespace(text)

    # Print the results.
    print(stemmer)
    for word in tokens:
        print('%20s => %s' % (word, stemmer.stem(word)))
    print()
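For reference, a self-contained sketch of what a regular-expression stemmer like stem.Regexp does, assuming min is the minimum word length at which stemming applies:

import re

def regexp_stem(word, pattern=r'ing$|s$|e$', min_len=4):
    # Sketch only: strip the first match of the pattern from words that are
    # at least min_len characters long; shorter words pass through unchanged.
    if len(word) < min_len:
        return word
    return re.sub(pattern, '', word, count=1)

for w in "John was eating icecream".split():
    print('%20s => %s' % (w, regexp_stem(w)))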
Example #9
def demo():
    import sys, time

    S = GrammarCategory.parse('S')
    VP = GrammarCategory.parse('VP')
    NP = GrammarCategory.parse('NP')
    PP = GrammarCategory.parse('PP')
    V = GrammarCategory.parse('V')
    N = GrammarCategory.parse('N')
    P = GrammarCategory.parse('P')
    Name = GrammarCategory.parse('Name')
    Det = GrammarCategory.parse('Det')
    DetSg = GrammarCategory.parse('Det[-pl]')
    DetPl = GrammarCategory.parse('Det[+pl]')
    NSg = GrammarCategory.parse('N[-pl]')
    NPl = GrammarCategory.parse('N[+pl]')

    # Define some grammatical productions.
    grammatical_productions = [
        cfg.Production(S, (NP, VP)),  cfg.Production(PP, (P, NP)),
        cfg.Production(NP, (NP, PP)),
        cfg.Production(VP, (VP, PP)), cfg.Production(VP, (V, NP)),
        cfg.Production(VP, (V,)), cfg.Production(NP, (DetPl, NPl)),
        cfg.Production(NP, (DetSg, NSg))]

    # Define some lexical productions.
    lexical_productions = [
        cfg.Production(NP, ('John',)), cfg.Production(NP, ('I',)),
        cfg.Production(Det, ('the',)), cfg.Production(Det, ('my',)),
        cfg.Production(Det, ('a',)),
        cfg.Production(NSg, ('dog',)),   cfg.Production(NSg, ('cookie',)),
        cfg.Production(V, ('ate',)),  cfg.Production(V, ('saw',)),
        cfg.Production(P, ('with',)), cfg.Production(P, ('under',)),
        ]
    
    earley_grammar = cfg.Grammar(S, grammatical_productions)
    earley_lexicon = {}
    for prod in lexical_productions:
        earley_lexicon.setdefault(prod.rhs()[0].upper(), []).append(prod.lhs())

    sent = 'I saw John with a dog with my cookie'
    print "Sentence:\n", sent
    from en.parser.nltk_lite import tokenize
    tokens = list(tokenize.whitespace(sent))
    t = time.time()
    cp = FeatureEarleyChartParse(earley_grammar, earley_lexicon, trace=1)
    trees = cp.get_parse_list(tokens)
    print "Time: %s" % (time.time() - t)
    for tree in trees: print tree
Example #10
def raw(files='english-kjv'):
    """
    @param files: One or more genesis corpus files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{string}
    """

    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str: files = (files, )

    for file in files:
        path = os.path.join(get_basedir(), "genesis", file + ".txt")
        s = open(path).read()
        for t in tokenize.whitespace(s):
            yield t
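Assuming the english-kjv.txt file is present under the genesis directory returned by get_basedir(), the iterator can be consumed like this (a usage sketch, not part of the original module):

# Print the first ten whitespace-delimited tokens of the corpus.
for i, token in enumerate(raw('english-kjv')):
    if i >= 10:
        break
    print(token)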
Example #11
def raw(files = 'english-kjv'):
    """
    @param files: One or more genesis corpus files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{string}
    """       

    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str: files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "genesis", file+".txt")
        s = open(path).read()
        for t in tokenize.whitespace(s):
            yield t
Example #12
def text_parse(grammar, sent, trace=2, drawtrees=False, latex=False):
    parser = grammar.earley_parser(trace=trace)
    print parser._grammar
    tokens = list(tokenize.whitespace(sent))
    trees = parser.get_parse_list(tokens)
    if drawtrees:
        from treeview import TreeView

        TreeView(trees)
    else:
        for tree in trees:
            if latex:
                print tree.latex_qtree()
            else:
                print tree
Example #13
def raw(files='raw'):
    """
    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{list(string)}
    """

    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str: files = (files, )

    for file in files:
        path = os.path.join(get_basedir(), "treebank", file)
        f = open(path).read()
        for sent in tokenize.blankline(f):
            l = []
            for t in tokenize.whitespace(sent):
                l.append(t)
            yield l
Example #14
def demo():
    """
    Create a shift reduce parser demo, using a simple grammar and
    text. 
    """

    from en.parser.nltk_lite.parse import cfg
    nonterminals = 'S VP NP PP P N Name V Det'
    (S, VP, NP, PP, P, N, Name, V,
     Det) = [cfg.Nonterminal(s) for s in nonterminals.split()]

    productions = (
        # Syntactic Productions
        cfg.Production(S, [NP, VP]),
        cfg.Production(NP, [Det, N]),
        cfg.Production(NP, [NP, PP]),
        cfg.Production(VP, [VP, PP]),
        cfg.Production(VP, [V, NP, PP]),
        cfg.Production(VP, [V, NP]),
        cfg.Production(PP, [P, NP]),

        # Lexical Productions
        cfg.Production(NP, ['I']),
        cfg.Production(Det, ['the']),
        cfg.Production(Det, ['a']),
        cfg.Production(N, ['man']),
        cfg.Production(V, ['saw']),
        cfg.Production(P, ['in']),
        cfg.Production(P, ['with']),
        cfg.Production(N, ['park']),
        cfg.Production(N, ['dog']),
        cfg.Production(N, ['statue']),
        cfg.Production(Det, ['my']),
    )

    grammar = cfg.Grammar(S, productions)

    # tokenize the sentence
    sent = list(
        tokenize.whitespace('my dog saw a man in the park with a statue'))

    ShiftReduceDemo(grammar, sent).mainloop()
Example #15
def demo():
    """
    A demonstration of the recursive descent parser.
    """

    from en.parser.nltk_lite.parse import cfg

    # Define some nonterminals
    S, VP, NP, PP = cfg.nonterminals('S, VP, NP, PP')
    V, N, P, Name, Det = cfg.nonterminals('V, N, P, Name, Det')

    # Define a grammar.
    productions = (
        # Syntactic Productions
        cfg.Production(S, [NP, 'saw', NP]),
        cfg.Production(S, [NP, VP]),
        cfg.Production(NP, [Det, N]),
        cfg.Production(VP, [V, NP, PP]),
        cfg.Production(NP, [Det, N, PP]),
        cfg.Production(PP, [P, NP]),

        # Lexical Productions
        cfg.Production(NP, ['I']),
        cfg.Production(Det, ['the']),
        cfg.Production(Det, ['a']),
        cfg.Production(N, ['man']),
        cfg.Production(V, ['saw']),
        cfg.Production(P, ['in']),
        cfg.Production(P, ['with']),
        cfg.Production(N, ['park']),
        cfg.Production(N, ['dog']),
        cfg.Production(N, ['telescope']))
    grammar = cfg.Grammar(S, productions)

    # Tokenize a sample sentence.
    sent = list(tokenize.whitespace('I saw a man in the park'))

    # Define a list of parsers.
    parser = RecursiveDescent(grammar)
    parser.trace()
    for p in parser.get_parse_list(sent):
        print p
Example #16
def demo():
    """
    A demonstration of the shift-reduce parser.
    """

    from en.parser.nltk_lite.parse import cfg

    # Define some nonterminals
    S, VP, NP, PP = cfg.nonterminals("S, VP, NP, PP")
    V, N, P, Name, Det = cfg.nonterminals("V, N, P, Name, Det")

    # Define a grammar.
    productions = (
        # Syntactic Productions
        cfg.Production(S, [NP, "saw", NP]),
        cfg.Production(S, [NP, VP]),
        cfg.Production(NP, [Det, N]),
        cfg.Production(VP, [V, NP, PP]),
        cfg.Production(NP, [Det, N, PP]),
        cfg.Production(PP, [P, NP]),
        # Lexical Productions
        cfg.Production(NP, ["I"]),
        cfg.Production(Det, ["the"]),
        cfg.Production(Det, ["a"]),
        cfg.Production(N, ["man"]),
        cfg.Production(V, ["saw"]),
        cfg.Production(P, ["in"]),
        cfg.Production(P, ["with"]),
        cfg.Production(N, ["park"]),
        cfg.Production(N, ["dog"]),
        cfg.Production(N, ["telescope"]),
    )
    grammar = cfg.Grammar(S, productions)

    # Tokenize a sample sentence.
    sent = list(tokenize.whitespace("I saw a man in the park"))

    parser = ShiftReduce(grammar)
    parser.trace()
    for p in parser.get_parse_list(sent):
        print p
Example #17
def demo():
    """
    Create a shift reduce parser demo, using a simple grammar and
    text. 
    """

    from en.parser.nltk_lite.parse import cfg

    nonterminals = "S VP NP PP P N Name V Det"
    (S, VP, NP, PP, P, N, Name, V, Det) = [cfg.Nonterminal(s) for s in nonterminals.split()]

    productions = (
        # Syntactic Productions
        cfg.Production(S, [NP, VP]),
        cfg.Production(NP, [Det, N]),
        cfg.Production(NP, [NP, PP]),
        cfg.Production(VP, [VP, PP]),
        cfg.Production(VP, [V, NP, PP]),
        cfg.Production(VP, [V, NP]),
        cfg.Production(PP, [P, NP]),
        # Lexical Productions
        cfg.Production(NP, ["I"]),
        cfg.Production(Det, ["the"]),
        cfg.Production(Det, ["a"]),
        cfg.Production(N, ["man"]),
        cfg.Production(V, ["saw"]),
        cfg.Production(P, ["in"]),
        cfg.Production(P, ["with"]),
        cfg.Production(N, ["park"]),
        cfg.Production(N, ["dog"]),
        cfg.Production(N, ["statue"]),
        cfg.Production(Det, ["my"]),
    )

    grammar = cfg.Grammar(S, productions)

    # tokenize the sentence
    sent = list(tokenize.whitespace("my dog saw a man in the park with a statue"))

    ShiftReduceDemo(grammar, sent).mainloop()
Example #18
def demo():
    """
    A demonstration of the recursive descent parser.
    """

    from en.parser.nltk_lite.parse import cfg

    # Define some nonterminals
    S, VP, NP, PP = cfg.nonterminals('S, VP, NP, PP')
    V, N, P, Name, Det = cfg.nonterminals('V, N, P, Name, Det')

    # Define a grammar.
    productions = (
        # Syntactic Productions
        cfg.Production(S, [NP, 'saw', NP]),
        cfg.Production(S, [NP, VP]),
        cfg.Production(NP, [Det, N]),
        cfg.Production(VP, [V, NP, PP]),
        cfg.Production(NP, [Det, N, PP]),
        cfg.Production(PP, [P, NP]),

        # Lexical Productions
        cfg.Production(NP, ['I']),   cfg.Production(Det, ['the']),
        cfg.Production(Det, ['a']),  cfg.Production(N, ['man']),
        cfg.Production(V, ['saw']),  cfg.Production(P, ['in']),
        cfg.Production(P, ['with']), cfg.Production(N, ['park']),
        cfg.Production(N, ['dog']),  cfg.Production(N, ['telescope'])
        )
    grammar = cfg.Grammar(S, productions)

    # Tokenize a sample sentence.
    sent = list(tokenize.whitespace('I saw a man in the park'))

    # Define a list of parsers.
    parser = RecursiveDescent(grammar)
    parser.trace()
    for p in parser.get_parse_list(sent):
        print p
Example #19
def demo():
    """
    Create a recursive descent parser demo, using a simple grammar and
    text.
    """
    from en.parser.nltk_lite.parse import cfg
    grammar = cfg.parse_grammar("""
    # Grammatical productions.
        S -> NP VP
        NP -> Det N PP | Det N
        VP -> V NP PP | V NP | V
        PP -> P NP
    # Lexical productions.
        NP -> 'I'
        Det -> 'the' | 'a'
        N -> 'man' | 'park' | 'dog' | 'telescope'
        V -> 'ate' | 'saw'
        P -> 'in' | 'under' | 'with'
    """)

    sent = list(tokenize.whitespace('the dog saw a man in the park'))

    RecursiveDescentDemo(grammar, sent).mainloop()
Example #20
def demo():
    """
    Create a recursive descent parser demo, using a simple grammar and
    text.
    """    
    from en.parser.nltk_lite.parse import cfg
    grammar = cfg.parse_grammar("""
    # Grammatical productions.
        S -> NP VP
        NP -> Det N PP | Det N
        VP -> V NP PP | V NP | V
        PP -> P NP
    # Lexical productions.
        NP -> 'I'
        Det -> 'the' | 'a'
        N -> 'man' | 'park' | 'dog' | 'telescope'
        V -> 'ate' | 'saw'
        P -> 'in' | 'under' | 'with'
    """)

    sent = list(tokenize.whitespace('the dog saw a man in the park'))

    RecursiveDescentDemo(grammar, sent).mainloop()
Example #21
def set_sentence(self, sentence):
    self._sent = list(tokenize.whitespace(sentence))  # [XX] use tagged?
    self.reset()
Example #22
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results is displayed.
    """
    import sys, time
    from en.parser.nltk_lite import tokenize
    from en.parser.nltk_lite.parse import cfg, pcfg, ViterbiParse

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [('I saw John with my cookie', pcfg.toy1),
             ('the boy saw Jack with Bob under the table with a telescope',
              pcfg.toy2)]

    # Ask the user which demo they want to use.
    print
    for i in range(len(demos)):
        print '%3s: %s' % (i + 1, demos[i][0])
        print '     %r' % demos[i][1]
        print
    print 'Which demo (%d-%d)? ' % (1, len(demos)),
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except:
        print 'Bad sentence number'
        return

    # Tokenize the sentence.
    tokens = list(tokenize.whitespace(sent))

    parser = ViterbiParse(grammar)
    all_parses = {}

    print '\nsent: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar)
    parser.trace(3)
    t = time.time()
    parses = parser.get_parse_list(tokens)
    time = time.time() - t
    if parses:
        average = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        average = 0
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print
    print 'Time (secs)   # Parses   Average P(parse)'
    print '-----------------------------------------'
    print '%11.4f%11d%19.14f' % (time, num_parses, average)
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print '------------------------------------------'
    print '%11s%11d%19.14f' % ('n/a', len(parses), p)

    # Ask the user if we should draw the parses.
    print
    print 'Draw parses (y/n)? ',
    if sys.stdin.readline().strip().lower().startswith('y'):
        from en.parser.nltk_lite.draw.tree import draw_trees
        print '  please wait...'
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print
    print 'Print parses (y/n)? ',
    if sys.stdin.readline().strip().lower().startswith('y'):
        for parse in parses:
            print parse
Example #23
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results is displayed.
    """
    import sys, time
    from en.parser.nltk_lite import tokenize
    from en.parser.nltk_lite.parse import cfg, pcfg, pchart

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [('I saw John with my cookie', pcfg.toy1),
             ('the boy saw Jack with Bob under the table with a telescope',
              pcfg.toy2)]

    # Ask the user which demo they want to use.
    print
    for i in range(len(demos)):
        print '%3s: %s' % (i+1, demos[i][0])
        print '     %r' % demos[i][1]
        print
    print 'Which demo (%d-%d)? ' % (1, len(demos)),
    try:
        snum = int(sys.stdin.readline().strip())-1
        sent, grammar = demos[snum]
    except:
        print 'Bad sentence number'
        return

    # Tokenize the sentence.
    tokens = list(tokenize.whitespace(sent))

    # Define a list of parsers.  We'll use all parsers.
    parsers = [
        pchart.InsideParse(grammar),
        pchart.RandomParse(grammar),
        pchart.UnsortedParse(grammar),
        pchart.LongestParse(grammar),
        pchart.BeamParse(len(tokens)+1, grammar)
        ]

    # Run the parsers on the tokenized sentence.
    times = []
    average_p = []
    num_parses = []
    all_parses = {}
    for parser in parsers:
        print '\ns: %s\nparser: %s\ngrammar: %s' % (sent,parser,pcfg)
        parser.trace(3)
        t = time.time()
        parses = parser.get_parse_list(tokens)
        times.append(time.time()-t)
        if parses: p = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
        else: p = 0
        average_p.append(p)
        num_parses.append(len(parses))
        for p in parses: all_parses[p.freeze()] = 1

    # Print some summary statistics
    print
    print '       Parser      | Time (secs)   # Parses   Average P(parse)'
    print '-------------------+------------------------------------------'
    for i in range(len(parsers)):
        print '%18s |%11.4f%11d%19.14f' % (parsers[i].__class__.__name__,
                                         times[i],num_parses[i],average_p[i])
    parses = all_parses.keys()
    if parses: p = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
    else: p = 0
    print '-------------------+------------------------------------------'
    print '%18s |%11s%11d%19.14f' % ('(All Parses)', 'n/a', len(parses), p)

    # Ask the user if we should draw the parses.
    print
    print 'Draw parses (y/n)? ',
    if sys.stdin.readline().strip().lower().startswith('y'):
        from en.parser.nltk_lite.draw.tree import draw_trees
        print '  please wait...'
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print
    print 'Print parses (y/n)? ',
    if sys.stdin.readline().strip().lower().startswith('y'):
        for parse in parses:
            print parse
Example #24
def _chunk_parse(files, chunk_types, top_node, partial_match, collapse_partials, cascade):
    # allow any kind of bracketing for flexibility

    L_BRACKET = re.compile(r'[\(\[\{<]')
    R_BRACKET = re.compile(r'[\)\]\}>]')

    if type(files) is str: files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "ycoe/psd", file + ".psd")
        s = open(path).read()
        data = _parse(s)
        for s in data:
            bracket = 0
            itmType = None
            stack = [tree.Tree(top_node, [])]
            inTag = []
            for itm in list(tokenize.whitespace(s)):
                if L_BRACKET.match(itm[0]):
                    bracket += 1
                    itm = itm[1:]
                    matched = False
                    if partial_match == True:
                        for eachItm in chunk_types:
                           if (len(eachItm) <= len(itm) and 
                               eachItm == itm[:len(eachItm)]):
                               matched = True
                               if collapse_partials == True:
                                   itm = eachItm
                    else:
                        if (chunk_types is not None and
                            itm in chunk_types):
                            matched = True
                    if matched == True: # and inTag == 0:
                        chunk = tree.Tree(itm, [])
                        if cascade == True:
                            stack.append(chunk)
                            inTag += [bracket]
                        else:
                            if len(inTag) == 0:
                                stack[-1].append(chunk)
                                inTag += [bracket]
                    itmType=itm
                if R_BRACKET.match(itm[-1]):
                    tmpItm = split(itm, itm[-1])
                    if tmpItm != "":
                        if len(inTag) > 0 and inTag[-1] <= bracket: #inTag <= bracket:
                            if cascade == True:
                                stack[-1].append( (itmType, tmpItm[0]) )
                            else:
                                stack[-1][-1].append( (itmType, tmpItm[0]) )
                        else:
                            if cascade == True:
                                if len(stack) > 1:
                                    stack[-2].append(stack[-1])
                                    stack = stack[:-1]
                            stack[-1].append( (itmType, tmpItm[0]) )
                            inTag = [] + inTag[:-2]
                    bracket -= (len(tmpItm)-1)
                    while( len(inTag) > 0 and bracket < inTag[-1] ):
                        if cascade == True:
                            if len(stack) > 1:
                                stack[-2].append(stack[-1])
                                stack = stack[:-1]
                        inTag = [] + inTag[:-2]
            yield stack
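The split(itm, itm[-1]) call above relies on a module-level split helper (old Python 2 code typically used string.split for this). An equivalent stand-in, stated here as an assumption, would be:

def split(s, sep):
    # Stand-in for the module-level split() assumed by _chunk_parse:
    # split s on sep, returning the resulting list of pieces.
    return s.split(sep)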
Example #25
def _list_sent(sent):
    return [tokenize.whitespace(line) for line in tokenize.line(sent)]
Example #26
def main():
    import sys
    from optparse import OptionParser, OptionGroup
    usage = """%%prog [options] [grammar_file]""" % globals()

    opts = OptionParser(usage=usage)
    opts.add_option("-c",
                    "--components",
                    action="store_true",
                    dest="show_components",
                    default=0,
                    help="show hole semantics components")
    opts.add_option("-r",
                    "--raw",
                    action="store_true",
                    dest="show_raw",
                    default=0,
                    help="show the raw hole semantics expression")
    opts.add_option("-d",
                    "--drawtrees",
                    action="store_true",
                    dest="draw_trees",
                    default=0,
                    help="show formula trees in a GUI window")
    opts.add_option("-v",
                    "--verbose",
                    action="count",
                    dest="verbosity",
                    default=0,
                    help="show more information during parse")

    (options, args) = opts.parse_args()

    if len(args) > 0:
        filename = args[0]
    else:
        filename = 'hole.cfg'

    print 'Reading grammar file', filename
    grammar = GrammarFile.read_file(filename)
    parser = grammar.earley_parser(trace=options.verbosity)

    # Prompt the user for a sentence.
    print 'Sentence: ',
    line = sys.stdin.readline()[:-1]

    # Parse the sentence.
    tokens = list(tokenize.whitespace(line))
    trees = parser.get_parse_list(tokens)
    print 'Got %d different parses' % len(trees)

    for tree in trees:
        # Get the semantic feature from the top of the parse tree.
        sem = tree[0].node['sem'].simplify()

        # Skolemise away all quantifiers.  All variables become unique.
        sem = sem.skolemise()

        # Reparse the semantic representation from its bracketed string format.
        # I find this uniform structure easier to handle.  It also makes the
        # code mostly independent of the lambda calculus classes.
        usr = bracket_parse(str(sem))

        # Break the hole semantics representation down into its components
        # i.e. holes, labels, formula fragments and constraints.
        hole_sem = HoleSemantics(usr)

        # Maybe print the raw semantic representation.
        if options.show_raw:
            print
            print 'Raw expression'
            print usr

        # Maybe show the details of the semantic representation.
        if options.show_components:
            print
            print 'Holes:       ', hole_sem.holes
            print 'Labels:      ', hole_sem.labels
            print 'Constraints: ', hole_sem.constraints
            print 'Top hole:    ', hole_sem.top_hole
            print 'Top labels:  ', hole_sem.top_most_labels
            print 'Fragments:'
            for (l, f) in hole_sem.fragments.items():
                print '\t%s: %s' % (l, f)

        # Find all the possible ways to plug the formulas together.
        pluggings = hole_sem.pluggings()

        # Build FOL formula trees using the pluggings.
        trees = map(hole_sem.formula_tree, pluggings)

        # Print out the formulas in a textual format.
        n = 1
        for tree in trees:
            print
            print '%d. %s' % (n, tree)
            n += 1

        # Maybe draw the formulas as trees.
        if options.draw_trees:
            draw_trees(*trees)

        print
        print 'Done.'
Example #27
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results is displayed.
    """
    import sys, time
    from en.parser.nltk_lite import tokenize
    from en.parser.nltk_lite.parse import cfg, pcfg, pchart

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [('I saw John with my cookie', pcfg.toy1),
             ('the boy saw Jack with Bob under the table with a telescope',
              pcfg.toy2)]

    # Ask the user which demo they want to use.
    print
    for i in range(len(demos)):
        print '%3s: %s' % (i + 1, demos[i][0])
        print '     %r' % demos[i][1]
        print
    print 'Which demo (%d-%d)? ' % (1, len(demos)),
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except:
        print 'Bad sentence number'
        return

    # Tokenize the sentence.
    tokens = list(tokenize.whitespace(sent))

    # Define a list of parsers.  We'll use all parsers.
    parsers = [
        pchart.InsideParse(grammar),
        pchart.RandomParse(grammar),
        pchart.UnsortedParse(grammar),
        pchart.LongestParse(grammar),
        pchart.BeamParse(len(tokens) + 1, grammar)
    ]

    # Run the parsers on the tokenized sentence.
    times = []
    average_p = []
    num_parses = []
    all_parses = {}
    for parser in parsers:
        print '\ns: %s\nparser: %s\ngrammar: %s' % (sent, parser, pcfg)
        parser.trace(3)
        t = time.time()
        parses = parser.get_parse_list(tokens)
        times.append(time.time() - t)
        if parses:
            p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
        else:
            p = 0
        average_p.append(p)
        num_parses.append(len(parses))
        for p in parses:
            all_parses[p.freeze()] = 1

    # Print some summary statistics
    print
    print '       Parser      | Time (secs)   # Parses   Average P(parse)'
    print '-------------------+------------------------------------------'
    for i in range(len(parsers)):
        print '%18s |%11.4f%11d%19.14f' % (parsers[i].__class__.__name__,
                                           times[i], num_parses[i],
                                           average_p[i])
    parses = all_parses.keys()
    if parses: p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else: p = 0
    print '-------------------+------------------------------------------'
    print '%18s |%11s%11d%19.14f' % ('(All Parses)', 'n/a', len(parses), p)

    # Ask the user if we should draw the parses.
    print
    print 'Draw parses (y/n)? ',
    if sys.stdin.readline().strip().lower().startswith('y'):
        from en.parser.nltk_lite.draw.tree import draw_trees
        print '  please wait...'
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print
    print 'Print parses (y/n)? ',
    if sys.stdin.readline().strip().lower().startswith('y'):
        for parse in parses:
            print parse
Example #28
def string2words(s, sep='/'):
    return [tag2tuple(t, sep)[0] for t in tokenize.whitespace(s)]
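string2words relies on a tag2tuple helper that splits a 'word/tag' token into its parts. A minimal sketch of such a helper (an assumption, not the original nltk_lite implementation):

def tag2tuple(token, sep='/'):
    # Sketch: split 'dog/NN' into ('dog', 'NN'); a token with no separator
    # gets a None tag.  Splitting on the last separator keeps words that
    # themselves contain '/' intact.
    if sep in token:
        word, _, tag = token.rpartition(sep)
        return (word, tag)
    return (token, None)

# string2words('The/DT dog/NN barked/VBD') -> ['The', 'dog', 'barked']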
Example #29
def _list_sent(sent):
    return [tokenize.whitespace(line) for line in tokenize.line(sent)]
Example #30
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results is displayed.
    """
    import sys, time
    from en.parser.nltk_lite import tokenize
    from en.parser.nltk_lite.parse import cfg, pcfg, ViterbiParse

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [
        ("I saw John with my cookie", pcfg.toy1),
        ("the boy saw Jack with Bob under the table with a telescope", pcfg.toy2),
    ]

    # Ask the user which demo they want to use.
    print
    for i in range(len(demos)):
        print "%3s: %s" % (i + 1, demos[i][0])
        print "     %r" % demos[i][1]
        print
    print "Which demo (%d-%d)? " % (1, len(demos)),
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except:
        print "Bad sentence number"
        return

    # Tokenize the sentence.
    tokens = list(tokenize.whitespace(sent))

    parser = ViterbiParse(grammar)
    all_parses = {}

    print "\nsent: %s\nparser: %s\ngrammar: %s" % (sent, parser, grammar)
    parser.trace(3)
    t = time.time()
    parses = parser.get_parse_list(tokens)
    time = time.time() - t
    if parses:
        average = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        average = 0
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print
    print "Time (secs)   # Parses   Average P(parse)"
    print "-----------------------------------------"
    print "%11.4f%11d%19.14f" % (time, num_parses, average)
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print "------------------------------------------"
    print "%11s%11d%19.14f" % ("n/a", len(parses), p)

    # Ask the user if we should draw the parses.
    print
    print "Draw parses (y/n)? ",
    if sys.stdin.readline().strip().lower().startswith("y"):
        from en.parser.nltk_lite.draw.tree import draw_trees

        print "  please wait..."
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print
    print "Print parses (y/n)? ",
    if sys.stdin.readline().strip().lower().startswith("y"):
        for parse in parses:
            print parse
Example #31
def _chunk_parse(files, chunk_types, top_node, partial_match,
                 collapse_partials, cascade):
    # allow any kind of bracketing for flexibility

    L_BRACKET = re.compile(r'[\(\[\{<]')
    R_BRACKET = re.compile(r'[\)\]\}>]')

    if type(files) is str: files = (files, )
    for file in files:
        path = os.path.join(get_basedir(), "ycoe/psd", file + ".psd")
        s = open(path).read()
        data = _parse(s)
        for s in data:
            bracket = 0
            itmType = None
            stack = [tree.Tree(top_node, [])]
            inTag = []
            for itm in list(tokenize.whitespace(s)):
                if L_BRACKET.match(itm[0]):
                    bracket += 1
                    itm = itm[1:]
                    matched = False
                    if partial_match == True:
                        for eachItm in chunk_types:
                            if (len(eachItm) <= len(itm)
                                    and eachItm == itm[:len(eachItm)]):
                                matched = True
                                if collapse_partials == True:
                                    itm = eachItm
                    else:
                        if (chunk_types is not None and itm in chunk_types):
                            matched = True
                    if matched == True:  # and inTag == 0:
                        chunk = tree.Tree(itm, [])
                        if cascade == True:
                            stack.append(chunk)
                            inTag += [bracket]
                        else:
                            if len(inTag) == 0:
                                stack[-1].append(chunk)
                                inTag += [bracket]
                    itmType = itm
                if R_BRACKET.match(itm[-1]):
                    tmpItm = split(itm, itm[-1])
                    if tmpItm != "":
                        if len(inTag) > 0 and inTag[-1] <= bracket:  # inTag <= bracket:
                            if cascade == True:
                                stack[-1].append((itmType, tmpItm[0]))
                            else:
                                stack[-1][-1].append((itmType, tmpItm[0]))
                        else:
                            if cascade == True:
                                if len(stack) > 1:
                                    stack[-2].append(stack[-1])
                                    stack = stack[:-1]
                            stack[-1].append((itmType, tmpItm[0]))
                            inTag = [] + inTag[:-2]
                    bracket -= (len(tmpItm) - 1)
                    while (len(inTag) > 0 and bracket < inTag[-1]):
                        if cascade == True:
                            if len(stack) > 1:
                                stack[-2].append(stack[-1])
                                stack = stack[:-1]
                        inTag = [] + inTag[:-2]
            yield stack
Example #32
def set_sentence(self, sentence):
    self._sent = list(tokenize.whitespace(sentence))  # [XX] use tagged?
    self.reset()
Example #33
def main():
    import sys
    from optparse import OptionParser, OptionGroup
    usage = """%%prog [options] [grammar_file]""" % globals()

    opts = OptionParser(usage=usage)
    opts.add_option("-c", "--components",
	action="store_true", dest="show_components", default=0,
	help="show hole semantics components")
    opts.add_option("-r", "--raw",
	action="store_true", dest="show_raw", default=0,
	help="show the raw hole semantics expression")
    opts.add_option("-d", "--drawtrees",
	action="store_true", dest="draw_trees", default=0,
	help="show formula trees in a GUI window")
    opts.add_option("-v", "--verbose",
	action="count", dest="verbosity", default=0,
	help="show more information during parse")

    (options, args) = opts.parse_args()

    if len(args) > 0:
        filename = args[0]
    else:
        filename = 'hole.cfg'

    print 'Reading grammar file', filename
    grammar = GrammarFile.read_file(filename)
    parser = grammar.earley_parser(trace=options.verbosity)

    # Prompt the user for a sentence.
    print 'Sentence: ',
    line = sys.stdin.readline()[:-1]

    # Parse the sentence.
    tokens = list(tokenize.whitespace(line))
    trees = parser.get_parse_list(tokens)
    print 'Got %d different parses' % len(trees)

    for tree in trees:
        # Get the semantic feature from the top of the parse tree.
        sem = tree[0].node['sem'].simplify()

        # Skolemise away all quantifiers.  All variables become unique.
        sem = sem.skolemise()

        # Reparse the semantic representation from its bracketed string format.
        # I find this uniform structure easier to handle.  It also makes the
        # code mostly independent of the lambda calculus classes.
        usr = bracket_parse(str(sem))

        # Break the hole semantics representation down into its components
        # i.e. holes, labels, formula fragments and constraints.
        hole_sem = HoleSemantics(usr)

        # Maybe print the raw semantic representation.
        if options.show_raw:
            print
            print 'Raw expression'
            print usr

        # Maybe show the details of the semantic representation.
        if options.show_components:
            print
            print 'Holes:       ', hole_sem.holes
            print 'Labels:      ', hole_sem.labels
            print 'Constraints: ', hole_sem.constraints
            print 'Top hole:    ', hole_sem.top_hole
            print 'Top labels:  ', hole_sem.top_most_labels
            print 'Fragments:'
            for (l,f) in hole_sem.fragments.items():
                print '\t%s: %s' % (l, f)

        # Find all the possible ways to plug the formulas together.
        pluggings = hole_sem.pluggings()

        # Build FOL formula trees using the pluggings.
        trees = map(hole_sem.formula_tree, pluggings)

        # Print out the formulas in a textual format.
        n = 1
        for tree in trees:
            print
            print '%d. %s' % (n, tree)
            n += 1

        # Maybe draw the formulas as trees.
        if options.draw_trees:
            draw_trees(*trees)

        print
        print 'Done.'