Esempio n. 1
0
def demo3():
    """Show a small toy grammar in a ``ProductionList`` widget.

    Builds a tuple of productions, displays them in a ``ProductionList``
    packed into a new ``Tk`` root window, and marks two of them up front.
    Pressing ``q`` destroys the window.  Both the ``select`` and the
    ``move`` callbacks are wired to the list's ``markonly`` method.
    """
    from nltk import Production

    (S, VP, NP, PP, P, N, Name, V, Det) = nonterminals(
        'S, VP, NP, PP, P, N, Name, V, Det'
    )

    # Rules that combine nonterminals (plus one mixed rule for PP).
    syntactic = (
        Production(S, [NP, VP]),
        Production(NP, [Det, N]),
        Production(NP, [NP, PP]),
        Production(VP, [VP, PP]),
        Production(VP, [V, NP, PP]),
        Production(VP, [V, NP]),
        Production(PP, [P, NP]),
        Production(PP, []),
        Production(PP, ['up', 'over', NP]),
    )
    # Rules that rewrite a category to a single word.
    lexical = (
        Production(NP, ['I']),
        Production(Det, ['the']),
        Production(Det, ['a']),
        Production(N, ['man']),
        Production(V, ['saw']),
        Production(P, ['in']),
        Production(P, ['with']),
        Production(N, ['park']),
        Production(N, ['dog']),
        Production(N, ['statue']),
        Production(Det, ['my']),
    )
    productions = syntactic + lexical

    root = Tk()

    def close_window(event, root=root):
        # Key-binding handler: tear down the demo window.
        root.destroy()

    root.bind('q', close_window)

    plist = ProductionList(root, productions)
    plist.pack(expand=1, fill='both')
    for event_name in ('select', 'move'):
        plist.add_callback(event_name, plist.markonly)
    plist.focus()

    # Pre-mark NP -> NP PP (index 2) and PP -> 'up' 'over' NP (index 8).
    for index in (2, 8):
        plist.mark(productions[index])
Esempio n. 2
0
def demo3():
    """Display a toy grammar's productions in a ``ProductionList``.

    A ``Tk`` root window is created, ``q`` is bound to destroying it, and
    a ``ProductionList`` holding the productions is packed to fill the
    window.  ``markonly`` is registered for the ``select`` and ``move``
    callbacks, and the productions at indices 2 and 8 start out marked.
    """
    from nltk import Production

    (S, VP, NP, PP, P, N, Name, V, Det) = nonterminals(
        'S, VP, NP, PP, P, N, Name, V, Det'
    )

    # (left-hand side, right-hand side) pairs, in display order.
    rules = [
        # Syntactic rules
        (S, [NP, VP]),
        (NP, [Det, N]),
        (NP, [NP, PP]),
        (VP, [VP, PP]),
        (VP, [V, NP, PP]),
        (VP, [V, NP]),
        (PP, [P, NP]),
        (PP, []),
        (PP, ['up', 'over', NP]),
        # Lexical rules
        (NP, ['I']),
        (Det, ['the']),
        (Det, ['a']),
        (N, ['man']),
        (V, ['saw']),
        (P, ['in']),
        (P, ['with']),
        (N, ['park']),
        (N, ['dog']),
        (N, ['statue']),
        (Det, ['my']),
    ]
    productions = tuple(Production(lhs, rhs) for lhs, rhs in rules)

    window = Tk()

    def on_quit(event, window=window):
        window.destroy()

    window.bind('q', on_quit)

    listing = ProductionList(window, productions)
    listing.pack(expand=1, fill='both')
    listing.add_callback('select', listing.markonly)
    listing.add_callback('move', listing.markonly)
    listing.focus()

    # Start with NP -> NP PP and PP -> 'up' 'over' NP marked.
    first_marked, second_marked = productions[2], productions[8]
    listing.mark(first_marked)
    listing.mark(second_marked)
Esempio n. 3
0
def demo3():
    """Open a ``ProductionList`` window populated with a toy grammar.

    The grammar is assembled from a handful of syntactic rules plus a
    small lexicon.  The list widget is packed into a new ``Tk`` root,
    given keyboard focus, and set up so that the ``select`` and ``move``
    callbacks both invoke ``markonly``.  Two productions are marked
    initially, and pressing ``q`` destroys the window.
    """
    from nltk import Production

    (S, VP, NP, PP, P, N, Name, V, Det) = nonterminals(
        "S, VP, NP, PP, P, N, Name, V, Det"
    )

    # Syntactic productions.
    syntactic = (
        Production(S, [NP, VP]),
        Production(NP, [Det, N]),
        Production(NP, [NP, PP]),
        Production(VP, [VP, PP]),
        Production(VP, [V, NP, PP]),
        Production(VP, [V, NP]),
        Production(PP, [P, NP]),
        Production(PP, []),
        Production(PP, ["up", "over", NP]),
    )

    # Lexical productions, written as (category, word) pairs.
    lexicon = (
        (NP, "I"),
        (Det, "the"),
        (Det, "a"),
        (N, "man"),
        (V, "saw"),
        (P, "in"),
        (P, "with"),
        (N, "park"),
        (N, "dog"),
        (N, "statue"),
        (Det, "my"),
    )
    lexical = tuple(Production(category, [word]) for category, word in lexicon)

    productions = syntactic + lexical

    top = Tk()

    def shut_down(event, top=top):
        # Handler for the "q" key binding.
        top.destroy()

    top.bind("q", shut_down)

    prod_list = ProductionList(top, productions)
    prod_list.pack(expand=1, fill="both")
    for hook in ("select", "move"):
        prod_list.add_callback(hook, prod_list.markonly)
    prod_list.focus()

    # Indices 2 and 8 are NP -> NP PP and PP -> "up" "over" NP.
    prod_list.mark(productions[2])
    prod_list.mark(productions[8])
Esempio n. 4
0
def demo3():
    """Run the ``ProductionList`` demo on a small example grammar.

    Creates a ``Tk`` root, binds ``q`` to destroy it, and packs a
    ``ProductionList`` of the example productions so that it expands to
    fill the window.  The widget's ``markonly`` method handles both the
    ``select`` and ``move`` callbacks, and two productions are marked
    before the function returns.
    """
    from nltk import Production

    symbols = "S, VP, NP, PP, P, N, Name, V, Det"
    (S, VP, NP, PP, P, N, Name, V, Det) = nonterminals(symbols)

    # Each entry is (left-hand side, right-hand side).
    rule_table = (
        # Syntactic rules
        (S, [NP, VP]),
        (NP, [Det, N]),
        (NP, [NP, PP]),
        (VP, [VP, PP]),
        (VP, [V, NP, PP]),
        (VP, [V, NP]),
        (PP, [P, NP]),
        (PP, []),
        (PP, ["up", "over", NP]),
        # Lexical rules
        (NP, ["I"]),
        (Det, ["the"]),
        (Det, ["a"]),
        (N, ["man"]),
        (V, ["saw"]),
        (P, ["in"]),
        (P, ["with"]),
        (N, ["park"]),
        (N, ["dog"]),
        (N, ["statue"]),
        (Det, ["my"]),
    )
    productions = tuple(Production(lhs, rhs) for lhs, rhs in rule_table)

    master = Tk()

    def quit_demo(event, master=master):
        master.destroy()

    master.bind("q", quit_demo)

    widget = ProductionList(master, productions)
    widget.pack(expand=1, fill="both")
    for callback_name in ("select", "move"):
        widget.add_callback(callback_name, widget.markonly)
    widget.focus()

    # Mark NP -> NP PP and PP -> "up" "over" NP to begin with.
    for position in (2, 8):
        widget.mark(productions[position])
Esempio n. 5
0
    def parse(self, p_string):
        """
        Parse a query string and store the resulting hierarchy of
        "domains", "hierarchies" and "tables" on this object.

        The string is parsed with the nltk context free grammar library.

        A query is a "sentence" and can be a domain, a hierarchy or a table.
        A domain is simply a word.
        A hierarchy is expressed as "domain/domain" or "domain/sentence".
        A table is expressed as "table(sentence, sentence, sentence)".

        Process:
          1. the string is tokenized (once)
          2. a context free grammar is built, with one domain production
             per distinct word token
          3. the tokens are parsed
          4. the first parse is converted to a tree representation

        Attributes written: ``string`` is always set to ``p_string`` and
        ``nltktree`` is reset to None.  On success ``parseList``,
        ``nltktree``, ``parseTree`` and ``xml`` are filled in.  If no parse
        is found, "Could not parse query." is printed and the method
        returns early, leaving ``nltktree`` as None.
        """
        self.nltktree = None

        # Store the query string
        self.string = p_string

        # Tokenize the query string, allowing only words, "table(",
        # closing parentheses, forward slashes and commas.
        # The tokens are materialized into a list right away so the same
        # sequence can be used both to build the grammar and to parse; the
        # tokenizer's result could previously only be consumed once, which
        # forced a second tokenization of the same string.
        re_all = r'table[(]|\,|[)]|[/]|\w+'
        toklist = list(tokenize.regexp_tokenize(self.string, re_all))

        # Develop a context free grammar
        # O = sentence (start symbol), T = table, H = hierarchy, D = domain
        O, T, H, D = nonterminals('O, T, H, D')

        # Specify the grammar
        productions = (
            # A sentence can be either a table, hierarchy or domain
            Production(O, [D]),
            Production(O, [H]),
            Production(O, [T]),

            # A table must be the following sequence:
            # "table(", sentence, comma, sentence, comma, sentence, ")"
            Production(T, ['table(', O, ',', O, ',', O, ')']),

            # A hierarchy must be the following sequence:
            # domain, forward slash, domain
            Production(H, [D, '/', D]),
            # domain, forward slash, another sentence
            Production(H, [D, '/', O]),
        )

        # Add one domain production per distinct word token, in order of
        # first appearance.  A repeated word must not add a second identical
        # production: duplicates only multiply the number of identical
        # parses the parser enumerates.
        re_domain = compile(r'^\w+$')
        seen_domains = set()
        domain_productions = []
        for tok in toklist:
            if tok not in seen_domains and re_domain.match(tok):
                seen_domains.add(tok)
                domain_productions.append(Production(D, [tok]))
        productions = productions + tuple(domain_productions)

        # Make a grammar out of our productions
        grammar = ContextFreeGrammar(O, productions)
        rd_parser = parse.RecursiveDescentParser(grammar)

        # Store the parsing.  Only the first parse is kept.
        try:
            self.parseList = rd_parser.get_parse_list(toklist)[0]
        except IndexError:
            # Parenthesized single argument prints identically whether
            # print is a statement or a function.
            print("Could not parse query.")
            return

        # Flatten the printed parse into a plain bracketed string by
        # removing, in this order: colons, the quoted ")" terminal, the
        # "table(" terminal, the quoted "," terminal, any remaining single
        # quotes, and slashes.
        # NOTE(review): this depends on the exact str() format of the
        # parse in the nltk version in use -- confirm before upgrading.
        bracketed = str(self.parseList)
        for fragment in (":", "')'", "table(", "','", "'", "/"):
            bracketed = bracketed.replace(fragment, "")
        self.nltktree = parse.tree.bracket_parse(bracketed)

        # Wrap the nltk tree and keep its XML rendering
        self.parseTree = QuerySentence(self.nltktree)
        self.xml = self.parseTree.toXML()
Esempio n. 6
0
    def parse(self, p_string):
        """
        Parse a query string and store the resulting hierarchy of
        "domains", "hierarchies" and "tables" on this object.

        The string is parsed with the nltk context free grammar library.

        A query is a "sentence" and can be a domain, a hierarchy or a table.
        A domain is simply a word.
        A hierarchy is expressed as "domain/domain" or "domain/sentence".
        A table is expressed as "table(sentence, sentence, sentence)".

        Process:
          1. the string is tokenized (once)
          2. a context free grammar is built, with one domain production
             per distinct word token
          3. the tokens are parsed
          4. the first parse is converted to a tree representation

        Attributes written: ``string`` is always set to ``p_string`` and
        ``nltktree`` is reset to None.  On success ``parseList``,
        ``nltktree``, ``parseTree`` and ``xml`` are filled in.  If no parse
        is found, "Could not parse query." is printed and the method
        returns early, leaving ``nltktree`` as None.
        """
        self.nltktree = None

        # Store the query string
        self.string = p_string

        # Tokenize the query string, allowing only words, "table(",
        # closing parentheses, forward slashes and commas.
        # Materialize the tokens immediately: the tokenizer's result could
        # previously only be consumed once, so the string used to be
        # tokenized a second time just to feed the parser.
        re_all = r'table[(]|\,|[)]|[/]|\w+'
        toklist = list(tokenize.regexp_tokenize(self.string, re_all))

        # Develop a context free grammar
        # O = sentence (start symbol), T = table, H = hierarchy, D = domain
        O, T, H, D = nonterminals('O, T, H, D')

        # Specify the grammar
        productions = (
            # A sentence can be either a table, hierarchy or domain
            Production(O, [D]),
            Production(O, [H]),
            Production(O, [T]),

            # A table must be the following sequence:
            # "table(", sentence, comma, sentence, comma, sentence, ")"
            Production(T, ['table(', O, ',', O, ',', O, ')']),

            # A hierarchy must be the following sequence:
            # domain, forward slash, domain
            Production(H, [D, '/', D]),
            # domain, forward slash, another sentence
            Production(H, [D, '/', O]),
        )

        # Add domains to the cfg productions.
        # A domain is a token that is entirely word chars.  Each distinct
        # word contributes exactly one production (first-seen order);
        # repeating a word in the query must not add duplicate productions,
        # which would only multiply the identical parses enumerated below.
        re_domain = compile(r'^\w+$')
        known_words = set()
        for tok in toklist:
            if tok in known_words or not re_domain.match(tok):
                continue
            known_words.add(tok)
            productions = productions + (Production(D, [tok]),)

        # Make a grammar out of our productions
        grammar = ContextFreeGrammar(O, productions)
        rd_parser = parse.RecursiveDescentParser(grammar)

        # Store the parsing.  Only the first parse is kept.
        try:
            self.parseList = rd_parser.get_parse_list(toklist)[0]
        except IndexError:
            # Parenthesized single argument prints identically whether
            # print is a statement or a function.
            print("Could not parse query.")
            return

        # Turn the printed parse into a plain bracketed string by removing,
        # in this order: colons, the quoted ")" terminal, the "table("
        # terminal, the quoted "," terminal, remaining single quotes, and
        # slashes.
        # NOTE(review): this depends on the exact str() format of the
        # parse in the nltk version in use -- confirm before upgrading.
        flattened = str(self.parseList)
        for fragment in (":", "')'", "table(", "','", "'", "/"):
            flattened = flattened.replace(fragment, "")
        self.nltktree = parse.tree.bracket_parse(flattened)

        # Store the resulting tree wrapper and its XML rendering
        self.parseTree = QuerySentence(self.nltktree)
        self.xml = self.parseTree.toXML()