Ejemplo n.º 1
0
 def _expand_node(self, production: nltk.Production):
     """Apply ``production`` to the node currently on top of the stack.

     The top node is popped. When its label equals the production's
     left-hand side, the right-hand side is handed to ``self._append`` and
     the production is recorded in ``self.actions``.

     Raises:
         ValueError: if the production's left-hand side does not match the
             popped node's label. The node is pushed back first, so the
             stack is left as it was found.
     """
     top = self.stack.pop()
     if not production.lhs() == top.label():
         # Restore the stack before reporting so the message shows it intact.
         self.stack.append(top)
         raise ValueError(
             f'Rule is not applicable: {production}, stack: {self.stack}.')
     self._append(top, production.rhs())
     self.actions.append(production)
Ejemplo n.º 2
0
def handle_singles(rule):
    """Expand a unit rule through the productions of its single RHS symbol.

    Every production that ``cfg_.productions`` returns for the first
    right-hand-side symbol of ``rule`` is re-headed with ``rule.lhs()``:

    * length-1 body whose symbol is not a nonterminal: appended to the
      module-level ``newrules``;
    * length-1 body whose symbol is a nonterminal: expanded recursively;
    * any other length: appended to the module-level ``allrules`` unless an
      equal production is already there.

    NOTE(review): there is no guard against cyclic unit rules
    (``A -> B``, ``B -> A``); confirm the grammar cannot contain them.
    """
    head = rule.lhs()
    for candidate in cfg_.productions(rule.rhs()[0]):
        body = candidate.rhs()
        if len(candidate) != 1:
            expanded = Production(head, body)
            if expanded not in allrules:
                allrules.append(expanded)
            continue
        if nltk.grammar.is_nonterminal(body[0]):
            handle_singles(Production(head, body))
        else:
            newrules.append(Production(head, body))
Ejemplo n.º 3
0
def test1():
    """Print the results of basic ``Nonterminal``/``Production`` operations.

    Shows a nonterminal's symbol, a production's left- and right-hand sides,
    and the outcome of comparing productions for equality.

    Each ``print`` call takes a single argument, so the output is the same
    under Python 2 and Python 3.
    """
    nt1 = Nonterminal('NP')
    nt2 = Nonterminal('VP')

    print(nt1.symbol())

    S, NP, VP, PP = nonterminals('S, NP, VP, PP')
    N, V, P, DT = nonterminals('N, V, P, DT')

    prod1 = Production(S, [NP, VP])
    prod2 = Production(NP, [DT, NP])

    print(prod1.lhs())
    print(prod1.rhs())
    print(prod1 == Production(S, [NP, VP]))
    print(prod1 == prod2)
Ejemplo n.º 4
0
def train_grammar(unknown_words=None, nb_reduced_production=6000):
    """Induce a PCFG from the treebank files listed in the global ``train``.

    Args:
        unknown_words: iterable of words for which a terminal rule is added
            under the left-hand side of every lexical production occurrence
            seen in training. ``None`` (the default) means no extra words.
        nb_reduced_production: number of most frequent distinct productions
            kept (with their original counts) before the lexical rules are
            added back.

    Returns:
        The grammar returned by ``induce_pcfg`` with start symbol ``S``.
    """
    # Materialise once: the words are iterated again for every lexical rule,
    # and a shared mutable default must not be used.
    unknown_words = [] if unknown_words is None else list(unknown_words)

    productions = []
    for item in train:
        for tree in treebank.parsed_sents(item):
            # Optional tree transformations applied in place:
            tree.collapse_unary(collapsePOS=False)  # Remove branches A-B-C into A-B+C
            tree.chomsky_normal_form(horzMarkov=2)  # Remove A->(B,C,D) into A->B,C+D->D
            productions += tree.productions()

    # Keep only the most frequent productions, repeated as often as they
    # occurred so that their relative counts are preserved.
    counter = collections.Counter(productions)
    n_comms = [
        prod
        for prod, count in counter.most_common(nb_reduced_production)
        for _ in range(count)
    ]

    # Add terminal rules (and their unknown-word variants) back into the
    # reduced production set.
    unknown_words_prods = []
    for p in productions:
        if isinstance(p.rhs()[0], str):
            unknown_words_prods.append(p)
            for u in unknown_words:
                unknown_words_prods.append(Production(p.lhs(), [u]))

    n_comms += unknown_words_prods
    S = Nonterminal('S')
    grammar = induce_pcfg(S, n_comms)

    return grammar
def update_complete_chart(chart, tokens, grammar, trace=False):
    """Updates non-diagonal elements of chart

    Cells are filled bottom-up by span length. For every split point of a
    span, the candidate categories of the two sub-spans are paired and looked
    up among the grammar's right-hand sides.

    Arguments:
    ----------
        chart (list):
            List of list containing chart algorithm elements. Only the first
            element of each sub-span cell is read, and it is iterated as a
            sequence of candidate categories.
        tokens (list):
            List of words in input sentence
        grammar:
            Object whose ``productions()`` method returns the production
            rules of the grammar
        trace (bool):
            Unused; kept for interface compatibility.

    Returns:
    --------
        The same ``chart`` object. Every cell covering two or more tokens is
        replaced by a list of ``(category, rule_string, mid)`` tuples, where
        ``mid`` is the split point that produced that entry.
    """
    # Map each right-hand side to every left-hand side that derives it; a
    # plain rhs -> lhs dict would silently keep only one of them.
    index = {}
    for production in grammar.productions():
        heads = index.setdefault(production.rhs(), [])
        if production.lhs() not in heads:
            heads.append(production.lhs())

    num_tokens = len(tokens)
    for span in range(2, num_tokens + 1):
        for start in range(num_tokens + 1 - span):
            end = start + span
            entries = []
            for mid in range(start + 1, end):
                nt1s, nt2s = chart[start][mid], chart[mid][end]
                if len(nt1s) != 0 and len(nt2s) != 0:
                    for nt1 in nt1s[0]:
                        for nt2 in nt2s[0]:
                            if nt1 and nt2 and (nt1, nt2) in index:
                                for lhs in index[(nt1, nt2)]:
                                    p = Production(
                                        lhs,
                                        (Nonterminal(nt1), Nonterminal(nt2)))
                                    # Record the split point together with the
                                    # rule it produced, not the last value the
                                    # loop variable happens to hold.
                                    entries.append(
                                        (lhs, f'{p.lhs()} -> {p.rhs()}', mid))
            chart[start][end] = entries
    return chart
Ejemplo n.º 6
0
 def _generate_production(self, t):
     """Return the production that rewrites the root of ``t`` into its children.

     Children that are plain ``str`` objects are kept as terminals; every
     other child contributes a ``Nonterminal`` built from its ``label()``.
     """
     rhs = tuple(
         child if type(child) == str else Nonterminal(child.label())
         for child in t)
     return Production(Nonterminal(t.label()), rhs)
Ejemplo n.º 7
0
def update_grammar(productions, unknown):
    """Add a lexical rule for each unknown word and re-induce the PCFG.

    The words are tagged with the module-level ``pos_tagger``; for each one a
    production ``TAG -> word`` is appended to ``productions`` (the list is
    modified in place) and reported on standard output.

    Args:
        productions: list of productions to extend.
        unknown: sequence of unknown words.

    Returns:
        The grammar returned by ``induce_pcfg`` with start symbol ``SENT``.
    """
    tagged = pos_tagger.tag(unknown)
    for i, tagged_word in enumerate(tagged):
        pos = nonterminals(tagged_word[1])[0]
        production_ = Production(pos, [unknown[i]])
        productions.append(production_)
        # Single-argument print: same output under Python 2 and Python 3.
        print('%s added to productions' % (production_,))

    S = Nonterminal('SENT')
    grammar = induce_pcfg(S, productions)

    return grammar
Ejemplo n.º 8
0
    def parse_productions(self, parse_tree, parent_label='', parent_annotation_level='non-preterminal'):
        """
        Recursively collect parent-annotated productions from a parse tree.

        A node with exactly one child is treated as a preterminal and yields a
        single production whose right-hand side is that child.

        :type parse_tree: nltk.Tree
        :type parent_label: str
        :param parent_annotation_level: Should be one of 'all' and 'non-preterminal'
                                        For the start_symbol, parent annotate as <original>_Parent_NULL
                                        if 'all' -> Parent annotate all nonterminals as <original>_Parent_<parent>
                                        if 'non-preterminal' -> Parent annotate only non-preterminals
                                        Any other value is handled like 'non-preterminal' on the
                                        left-hand side of single-child nodes.
        :type parent_annotation_level: str
        :rtype productions: list(nltk.Production)
        """
        if not parse_tree:
            return []
        elif len(parse_tree) == 1:
            if parent_annotation_level == 'all':
                updated_lhs = Nonterminal(parse_tree.label() + '_Parent_' + parent_label)
            else:
                updated_lhs = Nonterminal(parse_tree.label())
            return [Production(lhs=updated_lhs, rhs=[parse_tree[0]])]

        productions = []
        updated_rhs = []
        for i in parse_tree:
            if parent_annotation_level == 'non-preterminal' and len(i) == 1:
                updated_rhs.append(Nonterminal(i.label()))
            else:
                updated_rhs.append(Nonterminal(i.label() + '_Parent_' + parse_tree.label()))
            # Pass the annotation level down; otherwise nested levels fall back
            # to the default and their left-hand sides stop matching the
            # annotated symbols used on this level's right-hand side.
            productions += self.parse_productions(
                parse_tree=i,
                parent_label=parse_tree.label(),
                parent_annotation_level=parent_annotation_level)

        if not parent_label:
            parent_label = 'NULL'

        updated_lhs = Nonterminal(parse_tree.label() + '_Parent_' + parent_label)
        productions = [Production(lhs=updated_lhs, rhs=updated_rhs)] + productions
        return productions
Ejemplo n.º 9
0
def create_grammar(x_train):
    """Build a PCFG from treebank files, with an ``<UNK>`` rule per tag.

    Every tree of every file in ``x_train`` is converted to Chomsky normal
    form in place and its productions are collected. One ``TAG -> '<UNK>'``
    production is then added for each tag in the list below.

    Args:
        x_train: iterable of file identifiers accepted by
            ``treebank.parsed_sents``.

    Returns:
        The grammar returned by ``create_pcfg`` with start symbol ``S``.
    """
    productions = []
    for x in x_train:
        for tree in treebank.parsed_sents(x):
            tree.chomsky_normal_form()
            productions += tree.productions()

    S = Nonterminal('S')
    # Penn Treebank POS tags plus 'NP'. The possessive tags are 'PRP$' and
    # 'WP$'; listing 'PRP' and 'WP' twice left them without an <UNK> rule and
    # double-counted the plain tags.
    unk_tags = [
        'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD',
        'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR',
        'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
        'WDT', 'WP', 'WP$', 'WRB', 'NP',
    ]
    for w in unk_tags:
        productions.append(Production(Nonterminal(w), ('<UNK>', )))

    grammar = create_pcfg(S, productions)
    return grammar
Ejemplo n.º 10
0
def handle_long(rule, idx):
    """Binarise a production whose right-hand side has more than two symbols.

    While the rule is longer than two symbols, its last two symbols are
    replaced by a single nonterminal: the one ``checkinrhs`` returns for that
    pair, or, when it returns ``None``, a fresh ``Z<idx>`` whose defining rule
    is appended to the module-level ``newrules``. ``idx`` grows by one per
    reduction step.

    Returns:
        ``idx - 1`` (using the final value of ``idx``) after appending the
        resulting two-symbol rule to ``newrules``; ``None`` when the rule has
        fewer than two right-hand-side symbols.
    """
    while len(rule) > 2:
        symbols = list(rule.rhs())
        last_pair = (symbols[-2], symbols[-1])
        head = checkinrhs(*last_pair)
        if head is None:
            head = Nonterminal("Z" + str(idx))
            newrules.append(Production(head, last_pair))
        rule = Production(rule.lhs(), symbols[:-2] + [head])
        idx += 1

    if len(rule) == 2:
        newrules.append(rule)
        return idx - 1
    return None
Ejemplo n.º 11
0
def generate_impacts_grammar(attribute, phase):
    """Build the CFG for a question about the impact of the incident.

    The grammar chains 'Do' / 'you' / 'think' / 'the impact of the incident',
    then 'was' (when ``phase == 1``) or 'was not' (otherwise), then
    ``attribute`` and a closing '?'.

    Args:
        attribute: terminal placed after the verb.
        phase: ``1`` selects the affirmative verb; any other value the
            negated one.

    Returns:
        A ``CFG`` with start symbol ``S``.
    """
    nt = Nonterminal
    verb = 'was' if phase == 1 else 'was not'
    rules = [
        Production(nt('S'), (nt('AUX1'), )),
        Production(nt('AUX1'), ('Do', nt('S1'))),
        Production(nt('S1'), ('you', nt('V1'))),
        Production(nt('V1'), ('think', nt('ART'))),
        Production(nt('ART'), ('the impact of the incident', nt('V2'))),
        Production(nt('END'), ('?', )),
        Production(nt('V2'), (verb, nt('ATTR'))),
        Production(nt('ATTR'), (attribute, nt('END'))),
    ]
    return CFG(nt('S'), rules)
Ejemplo n.º 12
0
def add_rules(nonterminal_name: Text,
              list_terminals: Sequence[Text]) -> Sequence[Production]:
    """Create one production per terminal for the given nonterminal.

  Arguments:
    nonterminal_name: The name of the nonterminal used as left-hand side.
    list_terminals: The terminals; each one becomes the single right-hand
    side symbol of its own rule.
  Returns:
    A list of production rules, in the order of list_terminals.
  """
    return [
        Production(Nonterminal(nonterminal_name), (terminal, ))
        for terminal in list_terminals
    ]
Ejemplo n.º 13
0
Archivo: pcky.py Proyecto: zyocum/pcfg
 def build_tree(self, back, row, col, root):
     """Recursively rebuild a parse tree from a back-pointer matrix.

     ``back`` is indexed by ``(row, col, label_id)`` where ``label_id`` is
     ``self.index[root]``. When ``root`` is the left-hand side of one of
     ``self.terminals()``, a leaf-level tree is returned; otherwise the
     back-pointer ``(k, b, c)`` is followed into the spans ``(row, k)`` with
     root ``b`` and ``(k, col)`` with root ``c``.

     A ``TypeError`` raised while following the back-pointer yields an empty
     ``Tree(None, [])`` (the input is not licensed by the PCFG); any other
     exception propagates.
     """
     label_id = self.index[root]
     lexical_heads = (Production.lhs(rule) for rule in self.terminals())
     if root in lexical_heads:
         # Base case - lexical productions
         return Tree(root, [back[row + 1, row + 1, label_id]])
     # Recursive case - nonlexical productions
     try:
         split, left_root, right_root = back[row, col, label_id]
         left_subtree = self.build_tree(back, row, split, left_root)
         right_subtree = self.build_tree(back, split, col, right_root)
         return Tree(root, [left_subtree, right_subtree])
     except TypeError:
         # In case the input is unlicensed by the PCFG
         return Tree(None, [])
Ejemplo n.º 14
0
def cfg_demo():
    """
    A demonstration showing how C{ContextFreeGrammar}s can be created and used.

    Prints some nonterminals, a production, a small grammar parsed from text
    and the result of two coverage checks. All output goes through
    single-argument ``print(...)`` calls with explicit ``repr`` so that it is
    valid, and renders identically, under both Python 2 and Python 3.
    """

    # NOTE(review): ``parse_cfg`` and ``grammar.covers`` look like an older
    # NLTK API - confirm the installed NLTK version still provides them.
    from nltk import nonterminals, Production, parse_cfg

    # Create some nonterminals
    S, NP, VP, PP = nonterminals('S, NP, VP, PP')
    N, V, P, Det = nonterminals('N, V, P, Det')
    VP_slash_NP = VP / NP

    print('Some nonterminals: %s'
          % ([S, NP, VP, PP, N, V, P, Det, VP_slash_NP],))
    print('    S.symbol() => %s' % repr(S.symbol()))
    print('')

    print(Production(S, [NP]))

    # Create some Grammar Productions
    grammar = parse_cfg("""
      S -> NP VP
      PP -> P NP
      NP -> Det N | NP PP
      VP -> V NP | VP PP
      Det -> 'a' | 'the'
      N -> 'dog' | 'cat'
      V -> 'chased' | 'sat'
      P -> 'on' | 'in'
    """)

    print('A Grammar: %s' % repr(grammar))
    print('    grammar.start()       => %s' % repr(grammar.start()))
    # string.replace(...) is used to line-wrap the output.
    print('    grammar.productions() => %s'
          % repr(grammar.productions()).replace(',', ',\n' + ' ' * 25))
    print('')

    print('Coverage of input words by a grammar:')
    print(grammar.covers(['a', 'dog']))
    print(grammar.covers(['a', 'toy']))
Ejemplo n.º 15
0
    def _gp_lexicalized_rules(self, ptree):
        """Helper method to extract parent-annotated production rules
            from the tree

        Parameters
        ------------
        ptree : nltk Tree object
            Its subtrees must provide ``parent()``.

        Returns
        ---------
        production_rules : list
            One production per subtree that has a parent; the left-hand side
            is ``<parent label>^<node label>`` and the right-hand side is
            whatever ``_child_names`` returns for the node.

        Raises
        ---------
        TypeError
            If the label of ``ptree`` is not a string.

        """

        if not isinstance(ptree._label, string_types):
            raise TypeError('Productions can only be generated from trees having node labels that are strings')

        return [
            Production(Nonterminal(node.parent().label() + '^' + node._label), _child_names(node))
            for node in ptree.subtrees()
            if node.parent()
        ]
Ejemplo n.º 16
0
import nltk
from nltk import Nonterminal, nonterminals, Production, CFG
# Three independent nonterminal symbols.
nonterminal1 = Nonterminal('NP')
nonterminal2 = Nonterminal('VP')
nonterminal3 = Nonterminal('PP')

# Show each symbol, then the pairwise equality checks, one result per line.
print(nonterminal1.symbol(),
      nonterminal2.symbol(),
      nonterminal3.symbol(),
      sep='\n')
print(nonterminal1 == nonterminal2,
      nonterminal2 == nonterminal3,
      nonterminal1 == nonterminal3,
      sep='\n')

# Create several nonterminals at once from a comma-separated string.
S, NP, VP, PP = nonterminals('S, NP, VP, PP')
N, V, P, DT = nonterminals('N, V, P, DT')

# Productions pair a left-hand side with a list of right-hand-side symbols.
production1 = Production(S, [NP, VP])
production2 = Production(NP, [DT, NP])
production3 = Production(VP, [V, NP, NP, PP])

print(production1.lhs(),
      production1.rhs(),
      production3.lhs(),
      production3.rhs(),
      sep='\n')
print(production3 == Production(VP, [V, NP, NP, PP]),
      production2 == production3,
      sep='\n')

Ejemplo n.º 17
0
Archivo: pcky.py Proyecto: zyocum/pcfg
 def nonterminals(self):
     """Return a generator over the grammar's nonlexical productions.

     A production is kept when ``Production.is_nonlexical`` is true for it.
     """
     rules = self.grammar.productions()
     return (rule for rule in rules if Production.is_nonlexical(rule))
Ejemplo n.º 18
0
Archivo: cfg.py Proyecto: Geolem/nltk
def demo2():
    """Open ``CFGDemo`` on a small hand-built grammar and a sample sentence.

    The grammar is assembled from syntactic rules followed by single-word
    lexical rules; the call blocks in the demo's ``mainloop()``.
    """
    from nltk import Nonterminal, Production, CFG

    S, VP, NP, PP, P, N, Name, V, Det = map(
        Nonterminal, "S VP NP PP P N Name V Det".split())

    # (left-hand side, right-hand side) of the syntactic productions.
    syntactic = [
        (S, [NP, VP]),
        (NP, [Det, N]),
        (NP, [NP, PP]),
        (VP, [VP, PP]),
        (VP, [V, NP, PP]),
        (VP, [V, NP]),
        (PP, [P, NP]),
        (PP, []),
        (PP, ["up", "over", NP]),
    ]
    # (left-hand side, word) of the lexical productions.
    lexical = [
        (NP, "I"),
        (Det, "the"),
        (Det, "a"),
        (N, "man"),
        (V, "saw"),
        (P, "in"),
        (P, "with"),
        (N, "park"),
        (N, "dog"),
        (N, "statue"),
        (Det, "my"),
    ]
    productions = tuple(
        [Production(lhs, rhs) for lhs, rhs in syntactic]
        + [Production(lhs, [word]) for lhs, word in lexical])
    grammar = CFG(S, productions)

    words = "I saw a man in the park".split()
    CFGDemo(grammar, words).mainloop()
Ejemplo n.º 19
0
Archivo: cfg.py Proyecto: wrand/tweater
def demo3():
    """Show a ``ProductionList`` widget for a small hand-built rule set.

    Creates a Tk root window (closed with the 'q' key), packs the list into
    it, makes 'select' and 'move' events mark only the affected production,
    and marks the productions at positions 2 and 8.
    """
    from nltk import Production
    (S, VP, NP, PP, P, N, Name, V, Det) = \
        nonterminals('S, VP, NP, PP, P, N, Name, V, Det')

    # (left-hand side, right-hand side) of the syntactic productions.
    syntactic = [
        (S, [NP, VP]),
        (NP, [Det, N]),
        (NP, [NP, PP]),
        (VP, [VP, PP]),
        (VP, [V, NP, PP]),
        (VP, [V, NP]),
        (PP, [P, NP]),
        (PP, []),
        (PP, ['up', 'over', NP]),
    ]
    # (left-hand side, word) of the lexical productions.
    lexical = [
        (NP, 'I'),
        (Det, 'the'),
        (Det, 'a'),
        (N, 'man'),
        (V, 'saw'),
        (P, 'in'),
        (P, 'with'),
        (N, 'park'),
        (N, 'dog'),
        (N, 'statue'),
        (Det, 'my'),
    ]
    productions = tuple(
        [Production(lhs, rhs) for lhs, rhs in syntactic]
        + [Production(lhs, [word]) for lhs, word in lexical])

    window = Tk()
    window.bind('q', lambda e, t=window: t.destroy())

    listing = ProductionList(window, productions)
    listing.pack(expand=1, fill='both')
    listing.add_callback('select', listing.markonly)
    listing.add_callback('move', listing.markonly)
    listing.focus()
    listing.mark(productions[2])
    listing.mark(productions[8])
Ejemplo n.º 20
0
# -*- coding: utf-8 -*-
"""
Created on Wed Dec  9 13:04:57 2020

@author: Rahul Kothuri, Isak Nyberg
"""
import nltk
from nltk import Nonterminal, nonterminals, Production, CFG
# Two standalone nonterminal symbols.
w1 = Nonterminal("NP")
w2 = Nonterminal("VP")
# Create several nonterminals at once from comma-separated names.
S, NP, VP = nonterminals('S,NP,VP')
NLN, LN, V, LNP, DT, VBP, Adj, VBZ, RB = nonterminals(
    'NLN,LN,V,LNP, DT, VBP, Adj,VBZ,RB')
# Hand-built productions: a left-hand side plus a list of right-hand-side
# symbols.
prod1 = Production(S, [NP, VP])
prod2 = Production(NP, [DT, NP])
grammar = CFG.fromstring("""
    S ->  NP VP 
    NP -> Det LN | Det NLN | Det LNP
    VP -> V NP | VBP Adj | VBZ Adj | V RB | V | VBZ NP
    Det -> 'The'
    Det -> 'A'
    Det -> 'the'
    Det -> 'that'
    Det -> 'Those'
    LN -> 'girl' | 'boy' | 'dog' 
    LNP -> 'boys'
    NLN -> 'house' | 'crackers'
    V -> 'eats'
    V -> 'run' | 'runs'
    VBP -> 'are'
    VBZ -> 'is'
Ejemplo n.º 21
0
def parse_production(line, nonterm_parser, probabilistic=False):
    """Parse one grammar line into a list of productions.

    The line has the form ``LHS -> RHS | RHS | ...``. Each right-hand side is
    either a single quoted terminal or a sequence of nonterminals; when
    ``probabilistic`` is true it may also carry a ``[p]`` probability.

    Args:
        line: the text to parse.
        nonterm_parser: callable ``(line, pos) -> (nonterminal, new_pos)``
            used for the left-hand side and for every nonterminal on the
            right-hand side.
        probabilistic: when true, ``[p]`` probabilities are recognised and
            ``WeightedProduction`` objects are returned.

    Returns:
        One production per right-hand-side alternative, in source order.
        Alternatives without an explicit probability get ``0.0``.

    Raises:
        ValueError: on a missing arrow, an unterminated string, a probability
            greater than 1.0, more than one terminal in an alternative, or a
            mix of terminals and nonterminals.
    """
    # Raw-string patterns (no invalid escape sequences), compiled once per
    # call instead of on every loop iteration.
    arrow_re = re.compile(r'\s*->\s*')
    probability_re = re.compile(r'(\[[\d\.]+\])\s*')
    terminal_re = re.compile(r'("[^"]+"|' + r"'[^']+')\s*")
    disjunction_re = re.compile(r'\|\s*')

    pos = 0

    # Parse the left-hand side.
    lhs, pos = nonterm_parser(line, pos)

    # Skip over the arrow.
    m = arrow_re.match(line, pos)
    if not m:
        raise ValueError('Expected an arrow')
    pos = m.end()

    # Parse the right hand side.
    probabilities = [0.0]
    found_terminal = found_non_terminal = False
    rhsides = [[]]
    while pos < len(line):
        # Probability.
        m = probability_re.match(line, pos)
        if probabilistic and m:
            pos = m.end()
            probabilities[-1] = float(m.group(1)[1:-1])
            if probabilities[-1] > 1.0:
                raise ValueError('Production probability %f, '
                                 'should not be greater than 1.0' %
                                 (probabilities[-1], ))

        # String -- add terminal.
        elif line[pos] in ('"', "'"):
            m = terminal_re.match(line, pos)
            if not m:
                raise ValueError('Unterminated string')
            if found_terminal:
                raise ValueError('Bad right-hand-side: do not use '
                                 'a sequence of terminals')
            found_terminal = True
            rhsides[-1].append(m.group(1)[1:-1])
            pos = m.end()

        # Vertical bar -- start new rhside.
        elif line[pos] == '|':
            probabilities.append(0.0)
            found_terminal = found_non_terminal = False
            rhsides.append([])
            pos = disjunction_re.match(line, pos).end()

        # Anything else -- nonterminal.
        else:
            nonterm, pos = nonterm_parser(line, pos)
            rhsides[-1].append(nonterm)
            found_non_terminal = True

        if found_terminal and found_non_terminal:
            raise ValueError('Bad right-hand-side: do not mix '
                             'terminals and non-terminals')

    if probabilistic:
        return [
            WeightedProduction(lhs, rhs, prob=probability)
            for (rhs, probability) in zip(rhsides, probabilities)
        ]
    else:
        return [Production(lhs, rhs) for rhs in rhsides]
Ejemplo n.º 22
0
def demo(should_print_times=True, trace=1):
    """Parse a sample sentence with ``FeatureEarleyChartParser`` and print it.

    Builds a small feature grammar, prints it and the sentence, parses the
    sentence and prints every tree returned by ``nbest_parse``.

    Args:
        should_print_times: when true, also print the elapsed parsing time.
        trace: trace level passed to the parser.

    All output uses single-argument ``print(...)`` calls, so it is valid and
    identical under Python 2 and Python 3.
    """
    import time

    S = FeatStructNonterminal('S')
    VP = FeatStructNonterminal('VP')
    NP = FeatStructNonterminal('NP')
    PP = FeatStructNonterminal('PP')
    V = FeatStructNonterminal('V')
    N = FeatStructNonterminal('N')
    P = FeatStructNonterminal('P')
    Name = FeatStructNonterminal('Name')
    Det = FeatStructNonterminal('Det')
    DetSg = FeatStructNonterminal('Det[-pl]')
    DetPl = FeatStructNonterminal('Det[+pl]')
    NSg = FeatStructNonterminal('N[-pl]')
    NPl = FeatStructNonterminal('N[+pl]')

    productions = [
        # Define some grammatical productions.
        Production(S, (NP, VP)),  Production(PP, (P, NP)),
        Production(NP, (NP, PP)),
        Production(VP, (VP, PP)), Production(VP, (V, NP)),
        Production(VP, (V,)),     Production(NP, (DetPl, NPl)),
        Production(NP, (DetSg, NSg)),
        # Define some lexical productions.
        Production(NP, ('John',)), Production(NP, ('I',)),
        Production(Det, ('the',)), Production(Det, ('my',)),
        Production(Det, ('a',)),
        Production(NSg, ('dog',)), Production(NSg, ('cookie',)),
        Production(V, ('ate',)),   Production(V, ('saw',)),
        Production(P, ('with',)),  Production(P, ('under',)),
    ]

    earley_grammar = ContextFreeGrammar(S, productions)
    print(earley_grammar)
    print('')

    sent = 'I saw John with a dog with my cookie'
    print("Sentence: %s" % sent)
    print('')
    tokens = sent.split()
    t = time.time()
    cp = FeatureEarleyChartParser(earley_grammar, trace=trace)
    trees = cp.nbest_parse(tokens)
    print('')
    if should_print_times:
        print("Time: %s" % (time.time() - t))
    for tree in trees:
        print(tree)
Ejemplo n.º 23
0
            try:
                tup = (list1[i], list2[i])
            except IndexError:
                if len(list1) > len(list2):
                    list2.append('')
                    tup = (list1[i], list2[i])
                elif len(list1) < len(list2):
                    list1.append('')
                    tup = (list1[i], list2[i])
                continue

            merged_list.append(tup)
            break
    return merged_list


PosTuple = merge(pos, d)

# Add one lexical production per pair returned by merge(). The append has to
# happen inside the loop: placed after it, only the last production was added
# (and an empty PosTuple raised NameError on the undefined ``p``).
for item1, item2 in PosTuple:
    p = Production(Nonterminal(str(item1)), [str(item2)])
    CFGgrammar.append(p)


def sentenceparse(sent):
    """Parse a whitespace-tokenised sentence and print every resulting tree.

    A ``RecursiveDescentParser`` is built from the module-level
    ``CFGgrammar``. All parses are collected before the first one is printed.
    """
    parser = nltk.RecursiveDescentParser(CFGgrammar)
    parses = list(parser.parse(sent.split()))
    for parse in parses:
        print(parse)
Ejemplo n.º 24
0
def create_templates():
    """Creates the templates from the grammar.

    Builds a CFG from the binary rules below plus the terminal lists held in
    module-level constants, generates every sentence of the grammar,
    normalises punctuation and whitespace, writes the de-duplicated sentences
    to 'templates.csv', and finally adds one boolean column per entry of
    ``STREET_FEATURES`` telling whether the upper-cased feature name occurs in
    the sentence.

    Returns:
        The DataFrame of unique templates with the feature flag columns.
    """
    # (left-hand side, first symbol, second symbol) of each binary rule.
    binary_rules = [
        # Specific verb with goal and the rest of instruction body.
        ('S', 'V2', 'V2_BODY'),
        # A verb and rest of the instruction body assuming goal already
        # mentioned.
        ('V2_BODY', 'V1', 'M_G_ALREADY_V'),
        # A verb and the rest of the instruction body assuming the goal wasn't
        # mentioned before.
        ('S', 'V1', 'NO_GOAL'),
        # The goal in the beginning and the rest of the instruction body
        # assuming goal already mentioned.
        ('S', 'V1_GOAL', 'WITH_GOAL'),
        # A verb and 'to the' and then goal mention and the rest of the
        # instruction body.
        ('V1_GOAL', 'V1', 'V1_CON'),
        # A goal mention and the rest of the instruction body.
        ('WITH_GOAL', 'GOAL', 'M_G'),
        # Main part of the instruction without verb in beginning and resuming
        # sentence.
        ('M_G_ALREADY_V', 'MAIN_NO_V', 'END_NEAR_GOAL_KNOWN'),
        # Main part of the instruction, adding a new sentence.
        ('M_G', 'MAIN', 'END_NEAR_GOAL_KNOWN'),
        # End part - (1) near pivot assuming goal already mentioned; and (2)
        # avoid sentence.
        ('END_NEAR_GOAL_KNOWN', 'NEAR_GOAL_START', 'AVOID'),
        # End part - (1) near pivot assuming goal not mentioned yet; and (2)
        # avoid sentence.
        ('END_NEAR_GOAL_KNOWN', 'NEAR_GOAL_END', 'AVOID'),
        # Main part of the instruction without verb in beginning and resuming
        # sentence assuming no goal mentioned before.
        ('NO_GOAL', 'MAIN_NO_V', 'END_NEAR_GOAL_UNKNOWN'),
        # Add Goal to main part and then resume instruction by adding an
        # ending (near+avoid).
        ('END_NEAR_GOAL_UNKNOWN', 'GOAL_END', 'END_NEAR_GOAL_KNOWN'),
        # Add Goal with near and then add an avoid sentence.
        ('END_NEAR_GOAL_UNKNOWN', 'NEAR_GOAL_END', 'AVOID'),
    ]
    prods = [
        Production(Nonterminal(lhs), (Nonterminal(first), Nonterminal(second)))
        for lhs, first, second in binary_rules
    ]
    # Terminal for IN+DT after verb.
    prods.append(Production(Nonterminal('V1_CON'), ('to the', )))

    # Terminal rules, one group per nonterminal, in this fixed order.
    terminal_groups = [
        ('V2', V2),
        ('AVOID', AVOID),
        ('NEAR_GOAL_START', NEAR_GOAL_START),
        ('NEAR_GOAL_END', NEAR_GOAL_END),
        ('GOAL', GOAL),
        ('GOAL_END', GOAL_END),
        ('MAIN_NO_V', MAIN_NO_V),
        ('MAIN', MAIN),
        ('V1', V1),
    ]
    for name, terminals in terminal_groups:
        prods += add_rules(name, terminals)

    grammar = CFG(Nonterminal('S'), prods)

    # Generate templates.
    whitespace_run = re.compile(r'[\s]+')
    templates = []
    for words in nltk.parse.generate.generate(grammar):
        text = ' '.join(words)

        # Make sure the sentence ends with a period.
        if text[-1] != '.':
            text += '.'
        text = text.replace(" .", ".").replace(" ,", ",").replace("..", ".")

        # Collapse runs of whitespace into a single space.
        text = whitespace_run.sub(r' ', text)

        templates.append(text)

    templates_df = pd.DataFrame(templates,
                                columns=['sentence']).drop_duplicates()
    # Save templates
    templates_df.to_csv('templates.csv', index=False, header=False)

    # Flag features.
    for column in STREET_FEATURES:
        marker = column.upper()
        templates_df[column] = templates_df['sentence'].apply(
            lambda text: marker in text)

    return templates_df
Ejemplo n.º 25
0
    终结符的有限集合(T)
    开始符号(S)
    产生式的有限集合(P),形如:A->a
"""
# Nonterminals
nonterminal1 = Nonterminal('NP')
nonterminal2 = Nonterminal('VP')
nonterminal3 = Nonterminal('PP')
# Pairwise equality checks between the three symbols.
print((nonterminal1 == nonterminal2))
print((nonterminal2 == nonterminal3))
print((nonterminal1 == nonterminal3))

S, NP, VP, PP = nonterminals('S, NP, VP, PP')
N, V, P, DT = nonterminals('N, V, P, DT')
# Productions
production1 = Production(S, [NP, VP])
production2 = Production(NP, [DT, NP])
production3 = Production(VP, [V, NP, NP, PP])
print(production1.lhs(), production1.rhs())
print(production2.lhs(), production2.rhs())
print(production3.lhs(), production3.rhs())

# Grammar parsing
gram1 = nltk.data.load('grammars/large_grammars/atis.cfg')
# print(gram1)
sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt')
sent = nltk.parse.util.extract_test_sentences(sent)
# Pick entry 25 of the extracted test sentences and keep its first element.
testingsent = sent[25]
sent = testingsent[0]
"""FAQ. 递归下降分析 增量式 earley算法
通过保存增量解析步骤的结果和确保每一个解析函数在同一个输入位置只被调用一次,就可以把任意解析表达文法转化成一个Packrat Parser,
Ejemplo n.º 26
0
def generate_events_grammar(attribute, parent, phase):
    """Build the CFG for a question about the events behind the incident.

    The grammar chains 'Do' / 'you' / 'think' / 'the', then ``parent`` (or
    'events that caused the incident' when ``parent`` is ``None``), then
    'included' (when ``phase == 1``) or 'did not include' (otherwise), then
    ``attribute`` and a closing '?'.

    Args:
        attribute: terminal placed after the verb.
        parent: terminal used as the subject, or ``None`` for the default.
        phase: ``1`` selects the affirmative verb; any other value the
            negated one.

    Returns:
        A ``CFG`` with start symbol ``S``.
    """
    nt = Nonterminal
    subject = parent if parent is not None else 'events that caused the incident'
    verb = 'included' if phase == 1 else 'did not include'
    rules = [
        Production(nt('S'), (nt('AUX1'), )),
        Production(nt('AUX1'), ('Do', nt('S1'))),
        Production(nt('S1'), ('you', nt('V1'))),
        Production(nt('V1'), ('think', nt('ART'))),
        Production(nt('ATTR'), (attribute, nt('END'))),
        Production(nt('END'), ('?', )),
        Production(nt('ART'), ('the', nt('PAR'))),
        Production(nt('PAR'), (subject, nt('V2'))),
        Production(nt('V2'), (verb, nt('ATTR'))),
    ]
    return CFG(nt('S'), rules)
Ejemplo n.º 27
0
Archivo: cfg.py Proyecto: wrand/tweater
def demo2():
    """Open ``CFGDemo`` on a small hand-built grammar and a sample sentence.

    The grammar is assembled from syntactic rules followed by single-word
    lexical rules; the call blocks in the demo's ``mainloop()``.
    """
    from nltk import Nonterminal, Production, ContextFreeGrammar

    S, VP, NP, PP, P, N, Name, V, Det = map(
        Nonterminal, 'S VP NP PP P N Name V Det'.split())

    # (left-hand side, right-hand side) of the syntactic productions.
    syntactic = [
        (S, [NP, VP]),
        (NP, [Det, N]),
        (NP, [NP, PP]),
        (VP, [VP, PP]),
        (VP, [V, NP, PP]),
        (VP, [V, NP]),
        (PP, [P, NP]),
        (PP, []),
        (PP, ['up', 'over', NP]),
    ]
    # (left-hand side, word) of the lexical productions.
    lexical = [
        (NP, 'I'),
        (Det, 'the'),
        (Det, 'a'),
        (N, 'man'),
        (V, 'saw'),
        (P, 'in'),
        (P, 'with'),
        (N, 'park'),
        (N, 'dog'),
        (N, 'statue'),
        (Det, 'my'),
    ]
    productions = tuple(
        [Production(lhs, rhs) for lhs, rhs in syntactic]
        + [Production(lhs, [word]) for lhs, word in lexical])
    grammar = ContextFreeGrammar(S, productions)

    words = 'I saw a man in the park'.split()
    CFGDemo(grammar, words).mainloop()
Ejemplo n.º 28
0
def generate_sources_grammar(attribute, parent, phase):
    """Build the CFG for a question about the sources of the incident.

    The grammar chains 'Do' / 'you' / 'think' / 'the', then 'sources' (when
    ``parent`` is ``None``) or ``parent``, then the verb selected by
    ``phase``, then ``attribute`` and a closing '?'.

    Args:
        attribute: terminal placed after the verb.
        parent: terminal used as the subject, or ``None`` for 'sources'.
        phase: ``1`` selects the affirmative verb; any other value the
            negated one.

    Returns:
        A ``CFG`` with start symbol ``S``.
    """
    nt = Nonterminal
    # NOTE(review): the negated verb is spelled with an acute accent (´), not
    # an apostrophe - confirm whether that is intended.
    verb = 'included' if phase == 1 else 'didn´t include'
    if parent is None:
        link, subject = 'CLS', 'sources'
    else:
        link, subject = 'PAR', parent
    rules = [
        Production(nt('S'), (nt('AUX1'), )),
        Production(nt('AUX1'), ('Do', nt('S1'))),
        Production(nt('S1'), ('you', nt('V1'))),
        Production(nt('V1'), ('think', nt('ART'))),
        Production(nt('ATTR'), (attribute, nt('END'))),
        Production(nt('END'), ('?', )),
        Production(nt('V2'), (verb, nt('ATTR'))),
        Production(nt('ART'), ('the', nt(link))),
        Production(nt(link), (subject, nt('V2'))),
    ]
    return CFG(nt('S'), rules)
Ejemplo n.º 29
0
                ta /= len(list_tag_val)
                
                # armazena o resultado
                r = {'lp':lp, 'lr': lr, 'f1':f1, 'ta':ta}
                resultados.append(r)
            else:
                print("Sentença com mais de 18 palavras.")
        except Exception:
            print("Árvore mal formada.")

    # realiza o calculo da media para cada metrica
    media_lp = sum(item['lp'] for item in resultados)/len(resultados)
    media_lr = sum(item['lr'] for item in resultados)/len(resultados)
    media_f1 = sum(item['f1'] for item in resultados)/len(resultados)
    media_ta = sum(item['ta'] for item in resultados)/len(resultados)
    print("media_lp",media_lp,"media_lr",media_lr,"media_f1",media_f1,"media_ta",media_ta)

# extracts the trees from the floresta corpus, with their respective tags
filter_errors(floresta.parsed_sents())

roots = []
ROOT = Nonterminal('ROOT') # nonterminal representing the start symbol of the grammar
initial_symbols = list(set(initial_symbols)) # remove duplicates
for t in initial_symbols:
    roots += [Production(ROOT,[t])] # unify the grammar under a single start symbol

productions += roots
productions += [Production(Nonterminal("n"), ["UNK"])] # rule for unknown words (noun)

pcfg = induce_pcfg(ROOT, productions) # creates the PCFG from the start symbol and the rules
do_cky(pcfg) # applies the CKY algorithm (ViterbiParser)
Ejemplo n.º 30
0
def generate_entities_grammar(attribute, phase):
    """Build the CFG for a question about entities hit by the incident.

    The grammar chains 'Do' / 'you' / 'think', then ``attribute``, then 'are'
    (when ``phase == 1``) or 'are not' (otherwise), then 'impacted' or
    'affected', then 'by the incident' and a closing '?'.

    Args:
        attribute: terminal placed after 'think'.
        phase: ``1`` selects the affirmative verb; any other value the
            negated one.

    Returns:
        A ``CFG`` with start symbol ``S``.
    """
    nt = Nonterminal
    verb = 'are' if phase == 1 else 'are not'
    rules = [
        Production(nt('S'), (nt('AUX1'), )),
        Production(nt('AUX1'), ('Do', nt('S1'))),
        Production(nt('S1'), ('you', nt('V1'))),
        Production(nt('V1'), ('think', nt('ATTR'))),
        Production(nt('V3'), ('impacted', nt('OBJ'))),
        Production(nt('V3'), ('affected', nt('OBJ'))),
        Production(nt('OBJ'), ('by the incident', nt('END'))),
        Production(nt('END'), ('?', )),
        Production(nt('V2'), (verb, nt('V3'))),
        Production(nt('ATTR'), (attribute, nt('V2'))),
    ]
    return CFG(nt('S'), rules)
Ejemplo n.º 31
0
Archivo: cfg.py Proyecto: Geolem/nltk
def demo3():
    """Show a ``ProductionList`` widget for a small hand-built rule set.

    Creates a Tk root window (closed with the 'q' key), packs the list into
    it, makes "select" and "move" events mark only the affected production,
    and marks the productions at positions 2 and 8.
    """
    from nltk import Production

    (S, VP, NP, PP, P, N, Name, V,
     Det) = nonterminals("S, VP, NP, PP, P, N, Name, V, Det")

    # (left-hand side, right-hand side) of the syntactic productions.
    syntactic = [
        (S, [NP, VP]),
        (NP, [Det, N]),
        (NP, [NP, PP]),
        (VP, [VP, PP]),
        (VP, [V, NP, PP]),
        (VP, [V, NP]),
        (PP, [P, NP]),
        (PP, []),
        (PP, ["up", "over", NP]),
    ]
    # (left-hand side, word) of the lexical productions.
    lexical = [
        (NP, "I"),
        (Det, "the"),
        (Det, "a"),
        (N, "man"),
        (V, "saw"),
        (P, "in"),
        (P, "with"),
        (N, "park"),
        (N, "dog"),
        (N, "statue"),
        (Det, "my"),
    ]
    productions = tuple(
        [Production(lhs, rhs) for lhs, rhs in syntactic]
        + [Production(lhs, [word]) for lhs, word in lexical])

    window = Tk()
    window.bind("q", lambda e, t=window: t.destroy())

    listing = ProductionList(window, productions)
    listing.pack(expand=1, fill="both")
    listing.add_callback("select", listing.markonly)
    listing.add_callback("move", listing.markonly)
    listing.focus()
    listing.mark(productions[2])
    listing.mark(productions[8])
Ejemplo n.º 32
0
Archivo: pcky.py Proyecto: zyocum/pcfg
 def __init__(self, grammar):
     """Load the grammar and index its left-hand-side symbols.

     ``self.grammar`` is whatever ``self.load_grammar(grammar)`` returns;
     ``self.index`` is a ``CodeBook`` built from the set of left-hand sides
     of the grammar's productions.
     """
     super(PCKYParser, self).__init__()
     self.grammar = self.load_grammar(grammar)
     lhs_symbols = set(map(Production.lhs, self.grammar.productions()))
     self.index = CodeBook(lhs_symbols)