Ejemplo n.º 1
def demo():
    """
    A demonstration showing how PCFG C{Grammar}s can be created and used.

    Builds a small hand-written PCFG and prints it, induces a second PCFG
    from the productions of the first three trees yielded by
    C{treebank.parsed()}, and finally prints every parse that a
    C{pchart.InsideParse} parser built on the induced grammar returns for
    the sentence obtained from C{extract(0, treebank.raw())}.
    """
    from en.parser.nltk_lite.corpora import treebank, extract
    from en.parser.nltk_lite.parse import cfg, pcfg, pchart, treetransforms
    from itertools import islice

    # Create some probabilistic CFG Productions
    S, A, B, C = cfg.nonterminals('S A B C')
    pcfg_prods = [
        pcfg.Production(A, [B, B], prob=0.3),
        pcfg.Production(A, [C, B, C], prob=0.7),
        pcfg.Production(B, [B, 'b'], prob=0.5),
        pcfg.Production(B, [C], prob=0.5),
        pcfg.Production(C, ['a'], prob=0.1),
        pcfg.Production(C, ['b'], prob=0.9),
    ]

    # Every print below receives exactly one pre-built string, so the call
    # produces the same output under the Python 2 print statement and the
    # Python 3 print function.
    pcfg_prod = pcfg_prods[2]
    print('A PCFG production: ' + repr(pcfg_prod))
    print('    pcfg_prod.lhs()  => ' + repr(pcfg_prod.lhs()))
    print('    pcfg_prod.rhs()  => ' + repr(pcfg_prod.rhs()))
    print('    pcfg_prod.prob() => ' + repr(pcfg_prod.prob()))

    # Create and print a PCFG
    grammar = pcfg.Grammar(S, pcfg_prods)
    print('A PCFG grammar: ' + repr(grammar))
    print('    grammar.start()       => ' + repr(grammar.start()))
    # str.replace(...) is used to line-wrap the output: each comma is
    # followed by a newline and 26 spaces of indentation.
    wrapped = repr(grammar.productions()).replace(',', ',\n' + ' ' * 26)
    print('    grammar.productions() => ' + wrapped)

    # extract productions from three trees and induce the PCFG
    print("Induce PCFG grammar from treebank data:")

    productions = []
    for tree in islice(treebank.parsed(), 3):
        # perform optional in-place tree transformations, e.g.:
        # treetransforms.collapseUnary(tree, collapsePOS = False)
        # treetransforms.chomskyNormalForm(tree, horzMarkov = 2)

        productions += tree.productions()

    grammar = pcfg.induce(S, productions)
    print(grammar)

    print("Parse sentence using induced grammar:")

    parser = pchart.InsideParse(grammar)

    sent = extract(0, treebank.raw())
    print(sent)
    for parse in parser.get_parse_list(sent):
        print(parse)
Ejemplo n.º 2
def demo():
    """
    A demonstration showing how PCFG C{Grammar}s can be created and used.

    The demo has three stages: it prints a hand-written PCFG and one of its
    productions, induces a PCFG from the productions of the first three
    trees yielded by C{treebank.parsed()}, and prints the parses returned
    by a C{pchart.InsideParse} parser built on that induced grammar.
    """
    from en.parser.nltk_lite.corpora import treebank, extract
    from en.parser.nltk_lite.parse import cfg, pcfg, pchart, treetransforms
    from itertools import islice

    # Create some probabilistic CFG Productions
    S, A, B, C = cfg.nonterminals("S A B C")
    pcfg_prods = [
        pcfg.Production(A, [B, B], prob=0.3),
        pcfg.Production(A, [C, B, C], prob=0.7),
        pcfg.Production(B, [B, "b"], prob=0.5),
        pcfg.Production(B, [C], prob=0.5),
        pcfg.Production(C, ["a"], prob=0.1),
        pcfg.Production(C, ["b"], prob=0.9),
    ]

    # Each label and its repr() are joined into a single string before
    # printing; a one-argument print(...) behaves the same whether print is
    # the Python 2 statement or the Python 3 function.
    pcfg_prod = pcfg_prods[2]
    print("A PCFG production: " + repr(pcfg_prod))
    print("    pcfg_prod.lhs()  => " + repr(pcfg_prod.lhs()))
    print("    pcfg_prod.rhs()  => " + repr(pcfg_prod.rhs()))
    print("    pcfg_prod.prob() => " + repr(pcfg_prod.prob()))

    # Create and print a PCFG
    grammar = pcfg.Grammar(S, pcfg_prods)
    print("A PCFG grammar: " + repr(grammar))
    print("    grammar.start()       => " + repr(grammar.start()))
    # str.replace(...) line-wraps the output by breaking after every comma
    # and indenting the continuation by 26 spaces.
    production_text = repr(grammar.productions())
    production_text = production_text.replace(",", ",\n" + " " * 26)
    print("    grammar.productions() => " + production_text)

    # extract productions from three trees and induce the PCFG
    print("Induce PCFG grammar from treebank data:")

    productions = []
    for tree in islice(treebank.parsed(), 3):
        # perform optional in-place tree transformations, e.g.:
        # treetransforms.collapseUnary(tree, collapsePOS = False)
        # treetransforms.chomskyNormalForm(tree, horzMarkov = 2)

        productions += tree.productions()

    grammar = pcfg.induce(S, productions)
    print(grammar)

    print("Parse sentence using induced grammar:")

    parser = pchart.InsideParse(grammar)

    sent = extract(0, treebank.raw())
    print(sent)
    for parse in parser.get_parse_list(sent):
        print(parse)
Ejemplo n.º 3
def demo():
    """
    A demonstration of the treebank corpus reader.

    Prints the first three items yielded by each of C{treebank.parsed()},
    C{treebank.chunked()}, C{treebank.tagged()} and C{treebank.raw()}.
    Trees are printed through their C{pp()} method; tagged and raw items
    are printed as-is.
    """
    from en.parser.nltk_lite.corpora import treebank
    from itertools import islice

    # All prints are single-argument calls, which behave identically under
    # the Python 2 print statement and the Python 3 print function.
    print("Parsed:")
    for tree in islice(treebank.parsed(), 3):
        print(tree.pp())

    print("Chunked:")
    for tree in islice(treebank.chunked(), 3):
        print(tree.pp())

    print("Tagged:")
    for sent in islice(treebank.tagged(), 3):
        print(sent)

    print("Raw:")
    for sent in islice(treebank.raw(), 3):
        print(sent)
Ejemplo n.º 4
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.

    Stems every word of the first four sentences yielded by
    C{treebank.raw()}, then prints the original words followed by the
    stemmed words, each wrapped to lines of at most 70 characters.
    """
    import re
    from itertools import islice

    from en.parser.nltk_lite.corpora import treebank
    from en.parser.nltk_lite import stem

    stemmer = stem.Porter()

    orig = []
    stemmed = []
    # Four sentences: the counter-based loop this replaces broke out once
    # its sentence count exceeded 3.
    for sent in islice(treebank.raw(), 4):
        for word in sent:
            # Record both forms; without these appends the joined strings
            # below would always be empty.
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    # Convert the results to a string, and word-wrap them.
    results = ' '.join(stemmed)
    results = re.sub(r"(.{,70})\s", r'\1\n', results + ' ').rstrip()

    # Convert the original to a string, and word wrap it.
    original = ' '.join(orig)
    original = re.sub(r"(.{,70})\s", r'\1\n', original + ' ').rstrip()

    # Print the results: each banner is followed by the text it announces.
    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
    print(original)
    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
    print(results)
    print('*' * 70)
Ejemplo n.º 5
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.

    Collects the words of the first four sentences yielded by
    C{treebank.raw()} together with their stems, and prints both word
    sequences wrapped to lines of at most 70 characters.
    """
    import re
    from itertools import islice

    from en.parser.nltk_lite.corpora import treebank
    from en.parser.nltk_lite import stem

    stemmer = stem.Porter()

    orig = []
    stemmed = []
    # Bound the sample explicitly. The previous `if i>3: break` guard could
    # never fire because `i` was never incremented, so the whole corpus was
    # scanned; four sentences is the bound that guard gives when `i` counts
    # the sentences processed.
    for sent in islice(treebank.raw(), 4):
        for word in sent:
            # Keep the word and its stem; nothing was being stored before,
            # which left both output strings empty.
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    # Convert the results to a string, and word-wrap them.
    results = ' '.join(stemmed)
    results = re.sub(r"(.{,70})\s", r'\1\n', results + ' ').rstrip()

    # Convert the original to a string, and word wrap it.
    original = ' '.join(orig)
    original = re.sub(r"(.{,70})\s", r'\1\n', original + ' ').rstrip()

    # Print the results. Single-argument print(...) calls give the same
    # output under the Python 2 statement and the Python 3 function.
    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
    print(original)
    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
    print(results)
    print('*' * 70)