Example #1
0
File: pcfg.py Project: mgolden/en
def demo():
    """
    Demonstrate how PCFG C{Grammar}s can be created and used.
    """

    from en.parser.nltk_lite.corpora import treebank, extract
    from en.parser.nltk_lite.parse import cfg, pcfg, pchart, treetransforms
    from itertools import islice

    # Build a small hand-written set of probabilistic CFG productions.
    S, A, B, C = cfg.nonterminals('S A B C')
    toy_prods = [
        pcfg.Production(A, [B, B], prob=0.3),
        pcfg.Production(A, [C, B, C], prob=0.7),
        pcfg.Production(B, [B, 'b'], prob=0.5),
        pcfg.Production(B, [C], prob=0.5),
        pcfg.Production(C, ['a'], prob=0.1),
        pcfg.Production(C, ['b'], prob=0.9)
    ]

    sample = toy_prods[2]
    print('A PCFG production:', repr(sample))
    print('    pcfg_prod.lhs()  =>', repr(sample.lhs()))
    print('    pcfg_prod.rhs()  =>', repr(sample.rhs()))
    print('    pcfg_prod.prob() =>', repr(sample.prob()))
    print()

    # Assemble the productions into a grammar and display it.
    grammar = pcfg.Grammar(S, toy_prods)
    print('A PCFG grammar:', repr(grammar))
    print('    grammar.start()       =>', repr(grammar.start()))
    print('    grammar.productions() =>', end=' ')
    # The replace() call line-wraps the production list for readability.
    print(repr(grammar.productions()).replace(',', ',\n' + ' ' * 26))
    print()

    # Collect productions from the first three treebank parses and
    # induce a PCFG from their counts.
    print("Induce PCFG grammar from treebank data:")

    productions = []
    for tree in islice(treebank.parsed(), 3):
        # Optional in-place tree transformations could be applied here, e.g.:
        # treetransforms.collapseUnary(tree, collapsePOS = False)
        # treetransforms.chomskyNormalForm(tree, horzMarkov = 2)
        productions.extend(tree.productions())

    grammar = pcfg.induce(S, productions)
    print(grammar)
    print()

    # Parse the first raw treebank sentence with an inside-probability
    # chart parser built from the induced grammar.
    print("Parse sentence using induced grammar:")

    parser = pchart.InsideParse(grammar)
    parser.trace(3)

    sent = extract(0, treebank.raw())
    print(sent)
    for parse in parser.get_parse_list(sent):
        print(parse)
Example #2
0
def demo():
    """
    A demonstration showing how PCFG C{Grammar}s can be created and used.

    Fixed: this block used Python 2-only syntax (print statements and the
    backtick repr syntax), both removed in Python 3; ported to print()
    and repr() with identical output.
    """

    from en.parser.nltk_lite.corpora import treebank, extract
    from en.parser.nltk_lite.parse import cfg, pcfg, pchart, treetransforms
    from itertools import islice

    # Create some probabilistic CFG Productions
    S, A, B, C = cfg.nonterminals("S A B C")
    pcfg_prods = [
        pcfg.Production(A, [B, B], prob=0.3),
        pcfg.Production(A, [C, B, C], prob=0.7),
        pcfg.Production(B, [B, "b"], prob=0.5),
        pcfg.Production(B, [C], prob=0.5),
        pcfg.Production(C, ["a"], prob=0.1),
        pcfg.Production(C, ["b"], prob=0.9),
    ]

    # Show one production's accessors.
    pcfg_prod = pcfg_prods[2]
    print("A PCFG production:", repr(pcfg_prod))
    print("    pcfg_prod.lhs()  =>", repr(pcfg_prod.lhs()))
    print("    pcfg_prod.rhs()  =>", repr(pcfg_prod.rhs()))
    print("    pcfg_prod.prob() =>", repr(pcfg_prod.prob()))
    print()

    # Create and print a PCFG
    grammar = pcfg.Grammar(S, pcfg_prods)
    print("A PCFG grammar:", repr(grammar))
    print("    grammar.start()       =>", repr(grammar.start()))
    print("    grammar.productions() =>", end=" ")
    # str.replace(...) line-wraps the output.
    print(repr(grammar.productions()).replace(",", ",\n" + " " * 26))
    print()

    # Extract productions from three trees and induce the PCFG.
    print("Induce PCFG grammar from treebank data:")

    productions = []
    for tree in islice(treebank.parsed(), 3):
        # perform optional in-place tree transformations, e.g.:
        # treetransforms.collapseUnary(tree, collapsePOS = False)
        # treetransforms.chomskyNormalForm(tree, horzMarkov = 2)
        productions += tree.productions()

    grammar = pcfg.induce(S, productions)
    print(grammar)
    print()

    # Parse a sample sentence with the induced grammar.
    print("Parse sentence using induced grammar:")

    parser = pchart.InsideParse(grammar)
    parser.trace(3)

    sent = extract(0, treebank.raw())
    print(sent)
    for parse in parser.get_parse_list(sent):
        print(parse)
Example #3
0
def demo():
    """
    A demonstration showing how C{Grammar}s can be created and used.

    Fixed: this block used Python 2-only syntax (print statements and the
    backtick repr syntax), both removed in Python 3; ported to print()
    and repr() with identical output.
    """

    from en.parser.nltk_lite.parse import cfg

    # Create some nonterminals
    S, NP, VP, PP = cfg.nonterminals('S, NP, VP, PP')
    N, V, P, Det = cfg.nonterminals('N, V, P, Det')
    VP_slash_NP = VP / NP

    print('Some nonterminals:', [S, NP, VP, PP, N, V, P, Det, VP / NP])
    print('    S.symbol() =>', repr(S.symbol()))
    print()

    print(cfg.Production(S, [NP]))

    # Create some Grammar Productions
    grammar = cfg.parse_grammar("""
    S -> NP VP
    PP -> P NP
    NP -> Det N
    NP -> NP PP
    VP -> V NP
    VP -> VP PP
    Det -> 'a'
    Det -> 'the'
    N -> 'dog'
    N -> 'cat'
    V -> 'chased'
    V -> 'sat'
    P -> 'on'
    P -> 'in'
    """)

    print('A Grammar:', repr(grammar))
    print('    grammar.start()       =>', repr(grammar.start()))
    print('    grammar.productions() =>', end=' ')
    # str.replace(...) line-wraps the output.
    print(repr(grammar.productions()).replace(',', ',\n' + ' ' * 25))
    print()
Example #4
0
def demo():
    """
    A demonstration of the recursive descent parser.

    Fixed: `print p` was a Python 2 print statement (a syntax error in
    Python 3); ported to the print() function with identical output.
    """

    from en.parser.nltk_lite.parse import cfg

    # Define some nonterminals
    S, VP, NP, PP = cfg.nonterminals('S, VP, NP, PP')
    V, N, P, Name, Det = cfg.nonterminals('V, N, P, Name, Det')

    # Define a grammar.
    productions = (
        # Syntactic Productions
        cfg.Production(S, [NP, 'saw', NP]),
        cfg.Production(S, [NP, VP]),
        cfg.Production(NP, [Det, N]),
        cfg.Production(VP, [V, NP, PP]),
        cfg.Production(NP, [Det, N, PP]),
        cfg.Production(PP, [P, NP]),

        # Lexical Productions
        cfg.Production(NP, ['I']),
        cfg.Production(Det, ['the']),
        cfg.Production(Det, ['a']),
        cfg.Production(N, ['man']),
        cfg.Production(V, ['saw']),
        cfg.Production(P, ['in']),
        cfg.Production(P, ['with']),
        cfg.Production(N, ['park']),
        cfg.Production(N, ['dog']),
        cfg.Production(N, ['telescope']))
    grammar = cfg.Grammar(S, productions)

    # Tokenize a sample sentence.
    # NOTE(review): `tokenize` and `RecursiveDescent` are presumably
    # module-level imports outside this view -- confirm.
    sent = list(tokenize.whitespace('I saw a man in the park'))

    # Parse and print every parse found.
    parser = RecursiveDescent(grammar)
    parser.trace()
    for p in parser.get_parse_list(sent):
        print(p)
Example #5
0
def demo():
    """
    A demonstration showing how C{Grammar}s can be created and used.

    Fixed: this block used Python 2-only syntax (print statements and the
    backtick repr syntax), both removed in Python 3; ported to print()
    and repr() with identical output.
    """

    from en.parser.nltk_lite.parse import cfg

    # Create some nonterminals
    S, NP, VP, PP = cfg.nonterminals('S, NP, VP, PP')
    N, V, P, Det = cfg.nonterminals('N, V, P, Det')
    VP_slash_NP = VP/NP

    print('Some nonterminals:', [S, NP, VP, PP, N, V, P, Det, VP/NP])
    print('    S.symbol() =>', repr(S.symbol()))
    print()

    print(cfg.Production(S, [NP]))

    # Create some Grammar Productions
    grammar = cfg.parse_grammar("""
    S -> NP VP
    PP -> P NP
    NP -> Det N
    NP -> NP PP
    VP -> V NP
    VP -> VP PP
    Det -> 'a'
    Det -> 'the'
    N -> 'dog'
    N -> 'cat'
    V -> 'chased'
    V -> 'sat'
    P -> 'on'
    P -> 'in'
    """)

    print('A Grammar:', repr(grammar))
    print('    grammar.start()       =>', repr(grammar.start()))
    print('    grammar.productions() =>', end=' ')
    # str.replace(...) line-wraps the output.
    print(repr(grammar.productions()).replace(',', ',\n'+' '*25))
    print()
Example #6
0
def demo():
    """
    A demonstration of the shift-reduce parser.

    Fixed: `print p` was a Python 2 print statement (a syntax error in
    Python 3); ported to the print() function with identical output.
    """

    from en.parser.nltk_lite.parse import cfg

    # Define some nonterminals
    S, VP, NP, PP = cfg.nonterminals("S, VP, NP, PP")
    V, N, P, Name, Det = cfg.nonterminals("V, N, P, Name, Det")

    # Define a grammar.
    productions = (
        # Syntactic Productions
        cfg.Production(S, [NP, "saw", NP]),
        cfg.Production(S, [NP, VP]),
        cfg.Production(NP, [Det, N]),
        cfg.Production(VP, [V, NP, PP]),
        cfg.Production(NP, [Det, N, PP]),
        cfg.Production(PP, [P, NP]),
        # Lexical Productions
        cfg.Production(NP, ["I"]),
        cfg.Production(Det, ["the"]),
        cfg.Production(Det, ["a"]),
        cfg.Production(N, ["man"]),
        cfg.Production(V, ["saw"]),
        cfg.Production(P, ["in"]),
        cfg.Production(P, ["with"]),
        cfg.Production(N, ["park"]),
        cfg.Production(N, ["dog"]),
        cfg.Production(N, ["telescope"]),
    )
    grammar = cfg.Grammar(S, productions)

    # Tokenize a sample sentence.
    # NOTE(review): `tokenize` and `ShiftReduce` are presumably
    # module-level imports outside this view -- confirm.
    sent = list(tokenize.whitespace("I saw a man in the park"))

    # Parse and print every parse found.
    parser = ShiftReduce(grammar)
    parser.trace()
    for p in parser.get_parse_list(sent):
        print(p)
Example #7
0
def demo():
    """
    A demonstration of the recursive descent parser.

    Fixed: `print p` was a Python 2 print statement (a syntax error in
    Python 3); ported to the print() function with identical output.
    """

    from en.parser.nltk_lite.parse import cfg

    # Define some nonterminals
    S, VP, NP, PP = cfg.nonterminals('S, VP, NP, PP')
    V, N, P, Name, Det = cfg.nonterminals('V, N, P, Name, Det')

    # Define a grammar.
    productions = (
        # Syntactic Productions
        cfg.Production(S, [NP, 'saw', NP]),
        cfg.Production(S, [NP, VP]),
        cfg.Production(NP, [Det, N]),
        cfg.Production(VP, [V, NP, PP]),
        cfg.Production(NP, [Det, N, PP]),
        cfg.Production(PP, [P, NP]),

        # Lexical Productions
        cfg.Production(NP, ['I']),   cfg.Production(Det, ['the']),
        cfg.Production(Det, ['a']),  cfg.Production(N, ['man']),
        cfg.Production(V, ['saw']),  cfg.Production(P, ['in']),
        cfg.Production(P, ['with']), cfg.Production(N, ['park']),
        cfg.Production(N, ['dog']),  cfg.Production(N, ['telescope'])
        )
    grammar = cfg.Grammar(S, productions)

    # Tokenize a sample sentence.
    # NOTE(review): `tokenize` and `RecursiveDescent` are presumably
    # module-level imports outside this view -- confirm.
    sent = list(tokenize.whitespace('I saw a man in the park'))

    # Parse and print every parse found.
    parser = RecursiveDescent(grammar)
    parser.trace()
    for p in parser.get_parse_list(sent):
        print(p)
Example #8
0
    lcount = {} # LHS-count: counts the number of times a given lhs occurs

    for prod in productions:
        lcount[prod.lhs()] = lcount.get(prod.lhs(), 0) + 1
        pcount[prod]       = pcount.get(prod,       0) + 1

    prods = [Production(p.lhs(), p.rhs(), prob=float(pcount[p]) / lcount[p.lhs()])\
             for p in pcount]
    return Grammar(start, prods)


#################################################################
# Toy PCFGs
#################################################################

_S, _VP, _NP, _PP = cfg.nonterminals('S, VP, NP, PP')
_V, _N, _P, _Name, _Det = cfg.nonterminals('V, N, P, Name, Det')

toy1 = Grammar(_S, [
    Production(_NP, [_Det, _N], prob=0.5),
    Production(_NP, [_NP, _PP], prob=0.25),
    Production(_NP, ['John'], prob=0.1),
    Production(_NP, ['I'], prob=0.15),
    Production(_Det, ['the'], prob=0.8),
    Production(_Det, ['my'], prob=0.2),
    Production(_N, ['dog'], prob=0.5),
    Production(_N, ['cookie'], prob=0.5),
    Production(_VP, [_VP, _PP], prob=0.1),
    Production(_VP, [_V, _NP], prob=0.7),
    Production(_VP, [_V], prob=0.2),
    Production(_V, ['ate'], prob=0.35),
Example #9
0
    pcount = {}  # Production count: the number of times a given production occurs
    lcount = {}  # LHS-count: counts the number of times a given lhs occurs

    for prod in productions:
        lcount[prod.lhs()] = lcount.get(prod.lhs(), 0) + 1
        pcount[prod] = pcount.get(prod, 0) + 1

    prods = [Production(p.lhs(), p.rhs(), prob=float(pcount[p]) / lcount[p.lhs()]) for p in pcount]
    return Grammar(start, prods)


#################################################################
# Toy PCFGs
#################################################################

_S, _VP, _NP, _PP = cfg.nonterminals("S, VP, NP, PP")
_V, _N, _P, _Name, _Det = cfg.nonterminals("V, N, P, Name, Det")

toy1 = Grammar(
    _S,
    [
        Production(_NP, [_Det, _N], prob=0.5),
        Production(_NP, [_NP, _PP], prob=0.25),
        Production(_NP, ["John"], prob=0.1),
        Production(_NP, ["I"], prob=0.15),
        Production(_Det, ["the"], prob=0.8),
        Production(_Det, ["my"], prob=0.2),
        Production(_N, ["dog"], prob=0.5),
        Production(_N, ["cookie"], prob=0.5),
        Production(_VP, [_VP, _PP], prob=0.1),
        Production(_VP, [_V, _NP], prob=0.7),
Example #10
0
    def parse(self, p_string):
        """
        Parse a query string and store the resulting hierarchy of
        "domains", "hierarchies" and "tables".

        For the sake of NLP the string is parsed using the nltk_lite
        context free grammar library.

        A query is a "sentence" and can either be a domain, hierarchy
        or a table.
        A domain is simply a word.
        A hierarchy is expressed as "domain/domain".
        A table is expressed as "table(sentence, sentence, sentence)".

        Internally the query is represented as a nltk_lite.parse.tree.

        Process:
          1. string is tokenized
          2. develop a context free grammar
          3. parse
          4. convert to a tree representation

        Fixed: `print "Could not parse query."` was a Python 2 print
        statement (a syntax error in Python 3); ported to print().
        """
        self.nltktree = None

        # Store the query string
        self.string = p_string

        # 1. Tokenize
        # ---------------------------------------------------------------
        # Tokenize the query string, allowing only strings, parentheses,
        # forward slashes and commas.
        re_all = r'table[(]|\,|[)]|[/]|\w+'
        data_tokens = tokenize.regexp(self.string, re_all)

        # 2. Develop a context free grammar
        # ---------------------------------------------------------------
        # O = sentence, T = table, H = hierarchy, D = domain
        O, T, H, D = cfg.nonterminals('O, T, H, D')

        # Specify the grammar
        productions = (
            # A sentence can be either a table, hierarchy or domain
            cfg.Production(O, [D]), cfg.Production(O, [H]), cfg.Production(O, [T]),

            # A table must be the following sequence:
            # "table(", sentence, comma, sentence, comma, sentence, ")"
            cfg.Production(T, ['table(', O, ',', O, ',', O, ')']),

            # A hierarchy must be the following sequence:
            # domain, forward slash, domain
            cfg.Production(H, [D, '/', D]),
            # domain, forward slash, another operator
            cfg.Production(H, [D, '/', O])
        )

        # Add domains to the cfg productions.
        # A domain is a token that is entirely word chars.
        # NOTE(review): `compile` is presumably re.compile brought in by a
        # module-level import outside this view -- confirm.
        re_domain = compile(r'^\w+$')
        # Try every token and add it if it matches the regular expression.
        for tok in data_tokens:
            if re_domain.match(tok):
                # The trailing comma builds a 1-tuple so `+` concatenates.
                productions = productions + (cfg.Production(D, [tok]),)

        # Make a grammar out of our productions
        grammar = cfg.Grammar(O, productions)
        rd_parser = parse.RecursiveDescent(grammar)

        # Tokens need to be redefined: data_tokens is an iterator and is
        # exhausted by the loop above.
        tokens = tokenize.regexp(self.string, re_all)
        toklist = list(tokens)

        # 3. Parse using the context free grammar
        # ---------------------------------------------------------------
        # Store only the first parse, as the grammar should be completely
        # nonambiguous.
        try:
            self.parseList = rd_parser.get_parse_list(toklist)[0]
        except IndexError:
            print("Could not parse query.")
            return

        # 4. Refine and convert to a Tree representation
        # ---------------------------------------------------------------
        # Strip grammar punctuation from the bracketed parse string and
        # rebuild it as the nltk_lite.parse.tree for this query.
        string = str(self.parseList)
        string2 = string.replace(":","").replace("')'","").replace("table(","").replace("','","").replace("'","").replace("/","")
        self.nltktree = parse.tree.bracket_parse(string2)

        # Store the resulting nltk_lite.parse.tree tree
        self.parseTree = QuerySentence(self.nltktree)
        self.xml = self.parseTree.toXML()
Example #11
0
    def parse(self, p_string):
        """
        Parse a query string and store the resulting hierarchy of
        "domains", "hierarchies" and "tables".

        For the sake of NLP the string is parsed using the nltk_lite
        context free grammar library.

        A query is a "sentence" and can either be a domain, hierarchy
        or a table.
        A domain is simply a word.
        A hierarchy is expressed as "domain/domain".
        A table is expressed as "table(sentence, sentence, sentence)".

        Internally the query is represented as a nltk_lite.parse.tree.

        Process:
          1. string is tokenized
          2. develop a context free grammar
          3. parse
          4. convert to a tree representation

        Fixed: `print "Could not parse query."` was a Python 2 print
        statement (a syntax error in Python 3); ported to print().
        """
        self.nltktree = None

        # Store the query string
        self.string = p_string

        # 1. Tokenize
        # ---------------------------------------------------------------
        # Tokenize the query string, allowing only strings, parentheses,
        # forward slashes and commas.
        re_all = r'table[(]|\,|[)]|[/]|\w+'
        data_tokens = tokenize.regexp(self.string, re_all)

        # 2. Develop a context free grammar
        # ---------------------------------------------------------------
        # O = sentence, T = table, H = hierarchy, D = domain
        O, T, H, D = cfg.nonterminals('O, T, H, D')

        # Specify the grammar
        productions = (
            # A sentence can be either a table, hierarchy or domain
            cfg.Production(O, [D]),
            cfg.Production(O, [H]),
            cfg.Production(O, [T]),

            # A table must be the following sequence:
            # "table(", sentence, comma, sentence, comma, sentence, ")"
            cfg.Production(T, ['table(', O, ',', O, ',', O, ')']),

            # A hierarchy must be the following sequence:
            # domain, forward slash, domain
            cfg.Production(H, [D, '/', D]),
            # domain, forward slash, another operator
            cfg.Production(H, [D, '/', O]))

        # Add domains to the cfg productions.
        # A domain is a token that is entirely word chars.
        # NOTE(review): `compile` is presumably re.compile brought in by a
        # module-level import outside this view -- confirm.
        re_domain = compile(r'^\w+$')
        # Try every token and add it if it matches the regular expression.
        for tok in data_tokens:
            if re_domain.match(tok):
                # The trailing comma builds a 1-tuple so `+` concatenates.
                productions = productions + (cfg.Production(D, [tok]),)

        # Make a grammar out of our productions
        grammar = cfg.Grammar(O, productions)
        rd_parser = parse.RecursiveDescent(grammar)

        # Tokens need to be redefined: data_tokens is an iterator and is
        # exhausted by the loop above.
        tokens = tokenize.regexp(self.string, re_all)
        toklist = list(tokens)

        # 3. Parse using the context free grammar
        # ---------------------------------------------------------------
        # Store only the first parse, as the grammar should be completely
        # nonambiguous.
        try:
            self.parseList = rd_parser.get_parse_list(toklist)[0]
        except IndexError:
            print("Could not parse query.")
            return

        # 4. Refine and convert to a Tree representation
        # ---------------------------------------------------------------
        # Strip grammar punctuation from the bracketed parse string and
        # rebuild it as the nltk_lite.parse.tree for this query.
        string = str(self.parseList)
        string2 = string.replace(":", "").replace("')'", "").replace(
            "table(", "").replace("','", "").replace("'", "").replace("/", "")
        self.nltktree = parse.tree.bracket_parse(string2)

        # Store the resulting nltk_lite.parse.tree tree
        self.parseTree = QuerySentence(self.nltktree)
        self.xml = self.parseTree.toXML()