Example #1
def create_taskgrammar(grammar, task, encoders):
    logger.info('Creating specific grammar for task %s' % task)
    productions = grammar.productions(Nonterminal(task))
    start_token = Nonterminal('S')
    new_productions = []

    for start_production in productions:
        first_token = start_production.rhs()[0]
        if is_nonterminal(first_token) and first_token.symbol().endswith('_TASK'):
            for new_start_production in grammar.productions(first_token):
                new_productions.append(Production(start_token, new_start_production.rhs()))
        else:
            new_productions.append(Production(start_token, start_production.rhs()))

    for production in grammar.productions():
        for new_production in new_productions:
            if production.lhs() in new_production.rhs() and production not in new_productions:
                if production.lhs().symbol() == 'ENCODERS':  # Use encoders only for types of features in the dataset
                    if len(encoders) > 0:
                        new_productions.append(Production(production.lhs(), [Nonterminal(e) for e in encoders]))
                    else:
                        new_productions.append(Production(production.lhs(), ['E']))
                else:
                    new_productions.append(production)

    task_grammar = CFG(start_token, new_productions)

    with open(TASK_GRAMMAR_PATH, 'w') as fout:
        fout.write('\n'.join([str(x) for x in task_grammar.productions()]))

    return task_grammar
Example #2
def process_hybrid_productions(productions):
    new_productions_list = []  # list of new productions
    to_remove_list = []
    # Hybrid production
    for p in productions:
        is_hybrid = 0  # flag that indicates if the current production is hybrid
        if len(p.rhs()) > 1:  # more than one symbol on the right-hand side
            rh_list = []  # new list for right-hand symbols
            for r_symbol in p.rhs():
                if is_terminal(r_symbol):  # terminal symbol
                    dummy_symbol = Nonterminal(r_symbol)  # create dummy nonterminal
                    new_productions_list.append(
                        Production(dummy_symbol, [r_symbol]))  # new unit production
                    rh_list.append(dummy_symbol)
                    is_hybrid = 1  # hybrid production confirmed
                else:  # nonterminal symbol
                    rh_list.append(r_symbol)
            if is_hybrid:
                # we can't change the list while looping over it; store removals/additions first
                new_productions_list.append(
                    Production(p.lhs(), rh_list))  # new production with dummy symbols
                to_remove_list.append(p)
    return to_remove_list, new_productions_list
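A minimal driver for the helper above (a hypothetical sketch, assuming process_hybrid_productions and its nltk.grammar imports are in scope; the toy grammar is invented):

from nltk import CFG

toy = CFG.fromstring("S -> 'a' B\nB -> 'b'")
to_remove, to_add = process_hybrid_productions(toy.productions())
# drops the hybrid S -> 'a' B in favour of S -> a B plus the unit rule a -> 'a'
fixed = [p for p in toy.productions() if p not in to_remove] + to_add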
Example #3
def binarize(grammar):
    """Binarize grammar by introducing new nonterminals"""
    result = []

    for rule in grammar.productions():
        if len(rule.rhs()) > 2:
            # this rule needs to be broken down
            left_side = rule.lhs()
            symbol_names = [
                tsym.symbol() if not isinstance(tsym, str) else '@' + tsym
                for tsym in rule.rhs()
            ]
            for k in range(1, len(rule.rhs()) - 1):
                new_rhs_name = rule.lhs().symbol() + '|<' + '-'.join(
                    symbol_names[k:]) + '>'
                new_sym = Nonterminal(new_rhs_name)
                new_production = Production(left_side,
                                            (rule.rhs()[k - 1], new_sym))
                left_side = new_sym
                result.append(new_production)
            last_prd = Production(left_side, rule.rhs()[-2:])
            result.append(last_prd)
        else:
            result.append(rule)

    n_grammar = CFG(grammar.start(), result)
    return n_grammar
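A quick check of the transformation (a sketch, assuming binarize is in scope): a ternary rule is split in two via a fresh intermediate nonterminal.

from nltk import CFG

flat = CFG.fromstring("S -> A B C\nA -> 'a'\nB -> 'b'\nC -> 'c'")
for prod in binarize(flat).productions():
    print(prod)
# S -> A S|<B-C> and S|<B-C> -> B C replace S -> A B C; lexical rules pass through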
Example #4
def reinsert_unary_chains(tree, old_grammar):
  old_unary_productions = [p for p in old_grammar.productions() if len(p) == 1 and p.is_nonlexical()]

  nodeList = [tree]
  while nodeList != []:
    node = nodeList.pop()
    if not isinstance(node, Tree):
      continue
    
    assert len(node) <= 2

    nodeCopy = node.copy()
    children_rhs = [Nonterminal(child.label()) if not isinstance(child, str) else child for child in node]

    possibilities = []
    possibility = [Nonterminal(node.label())]
    query = Production(possibility[-1], children_rhs)
    while query not in old_grammar.productions():
      new_possibilities = [possibility + [p.rhs()[0]] for p in old_unary_productions if p.lhs() == possibility[-1]]
      possibilities.extend(new_possibilities)
      possibility = possibilities.pop(0)
      query = Production(possibility[-1], children_rhs)
      
    # Once a chain has been found, add it back in:
    node[0:] = [] # remove children
    lastnode = node
    for nt in possibility[1:]:
      newnode = Tree(nt.symbol(), [])
      lastnode[0:] = [newnode]
      lastnode = newnode
    lastnode[0:] = [child for child in nodeCopy]

    for child in lastnode:
      nodeList.append(child)
Example #5
def create_grammar() -> PCFG:
    # 21,763 productions with word terminals
    # 8,028 productions with pos terminals
    # 6,275 productions with nonterminals without digits
    # 5,402 productions with nonterminals without punctuation
    # 2,972 productions with nonterminals without suffixes
    # 707 nonterminals
    # 190 nonterminals without digit labels
    # 180 nonterminals without punctuation
    # 63 nonterminals without suffixes
    productions = []
    start_symbol = Nonterminal('S')
    for tree in nltk.corpus.treebank.parsed_sents():
        for production in tree.productions():
            if not valid_nonterminal(production.lhs()):
                continue
            if isinstance(production.rhs()[0], Nonterminal):
                lhs = simplify_nonterminal(production.lhs())
                rhs = tuple(
                    simplify_nonterminal(t) for t in production.rhs()
                    if valid_nonterminal(t))
                productions.append(Production(lhs, rhs))
            else:
                simplified = simplify_nonterminal(production.lhs())
                productions.append(
                    Production(simplified, (simplified.symbol(), )))

    grammar = nltk.induce_pcfg(start_symbol, productions)
    #print(grammar.productions())
    print(len(grammar.productions()))
    nonterminals = set(prod.lhs() for prod in grammar.productions())
    print(sorted(nonterminals))
    print(len(nonterminals))
    return grammar
Example #6
def convert_hybrid(grammar):
    '''
    Convert rules in the form of [A -> 'b' C] where the rhs has both non-terminals and terminals
    into rules in the form of [A -> B C] & [B -> 'b'] with a dummy non-terminal B
    '''
    rules = grammar.productions()
    new_rules = []
    for rule in rules:
        lhs = rule.lhs()
        rhs = rule.rhs()
        # check for hybrid rules
        if rule.is_lexical() and len(rhs) > 1:
            new_rhs = []
            for item in rule.rhs():
                if is_terminal(item):
                    new_sym = Nonterminal(item)
                    new_rhs.append(new_sym)
                    # add new lexical rule with dummy lhs nonterminal
                    new_rules.append(Production(new_sym, (item, )))
                else:
                    new_rhs.append(item)
            # add converted mixed rule with only non-terminals on rhs
            new_rules.append(Production(lhs, tuple(new_rhs)))
        else:
            new_rules.append(rule)

    new_grammar = CFG(grammar.start(), new_rules)

    return new_grammar
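A short sanity check (a sketch, assuming convert_hybrid and the nltk.grammar helpers it uses are in scope):

from nltk import CFG

hybrid = CFG.fromstring("A -> 'b' C\nC -> 'c'")
for prod in convert_hybrid(hybrid).productions():
    print(prod)
# A -> b C, b -> 'b', C -> 'c'  (the inline terminal moves behind a dummy nonterminal)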
Example #7
    def test_production_from_grammar(self):
        grammar_str = """
        S -> NP VP
        PP -> P NP
        NP -> Det N | NP PP
        VP -> V NP | VP PP
        Det -> 'a' | 'the'
        N -> 'dog' | 'cat'
        V -> 'chased' | 'sat'
        P -> 'on' | 'in'
        """

        grammar = parse_cfg(grammar_str)
        productions = grammar.productions()

        expect_production = Production(
            lhs=Nonterminal("S"), rhs=[Nonterminal("NP"),
                                       Nonterminal("VP")])
        error_msg = "Expect to find '{}', but can not see in \n{}".format(
            expect_production, grammar_str)
        self.assertIn(expect_production, productions, error_msg)

        expect_production = Production(lhs=Nonterminal("N"), rhs=['dog'])
        error_msg = "Expect to find '{}', but can not see in \n{}".format(
            expect_production, grammar_str)
        self.assertIn(expect_production, productions, error_msg)

        expect_not_in = Production(lhs="S", rhs=["NP", "VP"])
        self.assertNotIn(expect_not_in, productions, error_msg)

        expect_not_in = Production(lhs=Nonterminal("N"), rhs=["'dog'"])
        self.assertNotIn(expect_not_in, productions, error_msg)
Example #8
def remove_unary_rules(grammar):
    """Remove unary nonterminal productions A -> B"""
    result = []
    unary = []
    fake_rules = []
    removed_rules = []
    for rule in grammar.productions():
        if len(rule) == 1 and rule.is_nonlexical():
            unary.append(rule)
        else:
            result.append(rule)

    while unary:
        rule = unary.pop(0)
        removed_rules.append(rule)
        for item in grammar.productions(lhs=rule.rhs()[0]):
            new_rule = Production(rule.lhs(), item.rhs())
            if len(new_rule) != 1 or new_rule.is_lexical():
                result.append(new_rule)
                fake_rules.append(new_rule)
            else:
                unary.append(new_rule)

    n_grammar = CFG(grammar.start(), result)
    return n_grammar, grammar
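A quick illustration of the unary-rule removal (a sketch, assuming remove_unary_rules is in scope):

from nltk import CFG

g = CFG.fromstring("S -> A\nA -> B\nB -> 'b'")
new_g, _ = remove_unary_rules(g)
for prod in new_g.productions():
    print(prod)
# B -> 'b' survives, and the unary chains collapse to A -> 'b' and S -> 'b'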
Example #9
def train():
    print("Collecting sub-corpus from Penn Treebank (nltk.corpus)")
    
    # prepare parse trees extracted from the treebank
    tbank_trees = []
    for sent in treebank.parsed_sents():
        sent.chomsky_normal_form()
        tbank_trees.append(sent)
    
    # build vocabulary list, extracted from treebank
    vocab_size = 10000 # set vocabulary size to 10000
    words = [wrd.lower() for wrd in treebank.words()]
    vocab = [wrd for wrd, freq in Counter(words).most_common(vocab_size)]
    
    # generate the grammar rule list extracted from the treebank, and calculate rule probabilities based on their frequencies
    tbank_productions = set(production for tree in tbank_trees for production in tree.productions())
    tbank_grammar = CFG(Nonterminal('S'), list(tbank_productions))
    production_rules = tbank_grammar.productions()
    rules_to_prob = defaultdict(int)
    nonterm_occurrence = defaultdict(int)
    
    # calculate probability for rules
    for sent in tbank_trees:
        for production in sent.productions():
            if len(production.rhs()) == 1 and not isinstance(production.rhs()[0], Nonterminal):
                production = Production(production.lhs(), [production.rhs()[0].lower()])
            nonterm_occurrence[production.lhs()] += 1
            rules_to_prob[production] += 1
    for rule in rules_to_prob:
        rules_to_prob[rule] /= nonterm_occurrence[rule.lhs()]

    # use Katz smoothing
    rules_to_prob, vocab = katz_smooth(rules_to_prob, vocab)
    rules = list(rules_to_prob.keys())
    rules_reverse_dict = dict((j,i) for i, j in enumerate(rules))
    left_rules = defaultdict(set)
    right_rules = defaultdict(set)
    unary_rules = defaultdict(set)
    
    # classify left, right rules
    for rule in rules:
        if len(rule.rhs()) > 1:
            left_rules[rule.rhs()[0]].add(rule)
            right_rules[rule.rhs()[1]].add(rule)
        else:
            unary_rules[rule.rhs()[0]].add(rule)
    terminal_nonterms_rules = set(rule for rule in rules_to_prob if len(rule.rhs()) == 1 and isinstance(rule.rhs()[0], str))
    terminal_nonterms = defaultdict(int)
    for rule in terminal_nonterms_rules:
        terminal_nonterms[rule.lhs()] += 1
    pcfg_parser = {
        'vocab': vocab,
        'left_rules': left_rules,
        'right_rules': right_rules,
        'unary_rules': unary_rules,
        'rules_to_prob': rules_to_prob,
        'terminal_nonterms': terminal_nonterms
    }
    return pcfg_parser
Example #10
def _binarize(p, so_far=None):
    # avoid a shared mutable default argument
    if so_far is None:
        so_far = []
    if len(p.rhs()) <= 2:
        so_far.append(p)
        return so_far
    else:
        new_nont = Nonterminal(p.lhs().symbol() + '_' + str(next(counter)))
        so_far.append(Production(p.lhs(), [p.rhs()[0], new_nont]))
        return _binarize(Production(new_nont, p.rhs()[1:]), so_far)
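_binarize relies on a module-level counter for fresh names; a usage sketch under that assumption (the counter definition here is hypothetical):

from itertools import count
from nltk import CFG

counter = count()  # supplies fresh numeric suffixes for new nonterminals
long_rule = CFG.fromstring("S -> A B C D\nA -> 'a'\nB -> 'b'\nC -> 'c'\nD -> 'd'").productions()[0]
for prod in _binarize(long_rule):
    print(prod)
# S -> A S_0, S_0 -> B S_0_1, S_0_1 -> C D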
Example #11
def update_dictionary(lhs_dict, whole_dict, node):
    production = Production(Nonterminal(node.label()), get_child_names(node))
    if production.lhs() not in lhs_dict:
        lhs_dict[production.lhs()] = 0
    if production not in whole_dict:
        whole_dict[production] = 0

    lhs_dict[production.lhs()] += 1
    whole_dict[production] += 1
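get_child_names is defined elsewhere in the original source; a plausible stand-in plus a usage sketch (both hypothetical):

from nltk import Tree
from nltk.grammar import Nonterminal

def get_child_names(node):
    # hypothetical helper: nonterminals for subtree children, raw strings for leaves
    return [Nonterminal(c.label()) if isinstance(c, Tree) else c for c in node]

lhs_counts, prod_counts = {}, {}
update_dictionary(lhs_counts, prod_counts, Tree('NP', [Tree('Det', ['the']), Tree('N', ['dog'])]))
# lhs_counts now maps NP to 1; prod_counts maps NP -> Det N to 1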
Example #12
def make_grammar(parse, mrepr='tokens-and-lemmas'):
    """
    Return a list of Productions on the basis of an output parse of L{MBMA}.
    MBMA returns parses in the following format::
    
        [('V|*V', 'ver'), ('V', 'eis'), ('INFLtWB', 't')]
        
    This is transformed into the following list of productions::
    
        [PRE:ver -> 'ver', V -> PRE:ver V, V -> 'eis', INFL:t -> 't', V -> V INFL:t]

    Args:
        - parse (list): a parse returned by :func:`mbmp.MBMA.classify`

    Returns:
        list -- a list of Productions.
    """
    prods = []
    for morph in parse:
        pos, lemma = morph.pos, morph.lemma
        if pos.endswith('WB'):
            pos = pos[:-2]
        leaf = morph.pprint(mrepr)
        # tags with '|' split all non-lexical lemmas from lexical ones
        if '|' in pos:
            superpos, pos = pos.split('|')
            if pos.startswith('INFL'):
                nonterminalpos = 'INFL:%s' % lemma
                nonterms = [Nonterminal(nonterminalpos), Nonterminal(pos[-1])]
            elif pos.endswith('INFL'):
                nonterminalpos = 'INFL:%s' % lemma
                nonterms = [Nonterminal(pos[0]), Nonterminal(nonterminalpos)]
            elif pos.startswith('*'):  # it's a prefix
                nonterminalpos = 'PRE:%s' % lemma
                nonterms = nonterminals([nonterminalpos] + list(pos[1:]))
            elif pos.endswith(('*', '*WB')):  # it's a suffix
                pos = pos[:pos.find('*')]
                nonterminalpos = 'SUF:%s' % lemma
                nonterms = nonterminals((list(pos) + [nonterminalpos]))
            else:  # it's a linking element
                nonterminalpos = 'LE:%s' % lemma
                leidx = pos.find('*')
                nonterms = nonterminals(
                    list(pos[:leidx]) + [nonterminalpos] +
                    list(pos[leidx + 1:]))
            if 'x' in pos:
                prods.append(Production(Nonterminal('x'), [leaf]))
            prods.append(Production(Nonterminal(nonterminalpos), [leaf]))
            if nonterms:
                prods.append(Production(Nonterminal(superpos), nonterms))
        else:
            prods.append(Production(Nonterminal(pos), [leaf]))
    return prods
Example #13
    def __init__(self, lhs, rhs, cost):
        """
        Construct a new ``ProbabilisticProduction``.

        :param lhs: The left-hand side of the new ``ProbabilisticProduction``.
        :type lhs: Nonterminal
        :param rhs: The right-hand side of the new ``ProbabilisticProduction``.
        :type rhs: sequence(Nonterminal and terminal)
        :param cost: Cost of the new ``ProbabilisticProduction``; stored internally as ``logprob = -cost``.
        """
        ImmutableProbabilisticMixIn.__init__(self, logprob=-cost)
        Production.__init__(self, lhs, rhs)
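The constructor above evidently belongs to a Production subclass that mixes in ImmutableProbabilisticMixIn; a self-contained sketch of such a class (the name CostedProduction is invented):

from nltk.grammar import Nonterminal, Production
from nltk.probability import ImmutableProbabilisticMixIn

class CostedProduction(Production, ImmutableProbabilisticMixIn):
    """A production weighted by a cost, stored as a negative log probability."""
    def __init__(self, lhs, rhs, cost):
        ImmutableProbabilisticMixIn.__init__(self, logprob=-cost)
        Production.__init__(self, lhs, rhs)

p = CostedProduction(Nonterminal('S'), [Nonterminal('NP'), Nonterminal('VP')], cost=1.5)
print(p, p.logprob())  # S -> NP VP -1.5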
Example #14
def _remove_empty_productions(input_productions, letters):
    """Remove productions with empty right hand sides."""
    copied_prods = deepcopy(input_productions)

    #
    # Find all nonterminals that generate the empty string.
    #
    # Basis: a nonterminal generates the empty string if it is the LHS of a
    # production whose RHS is empty.
    gen_empty = [prod.lhs() for prod in copied_prods
                 if len(prod.rhs()) == 0]
    N = len(gen_empty)

    # Induction:
    while True:
        for nonterm in gen_empty:
            for prod in copied_prods:
                if nonterm in prod.rhs():
                    better = list(prod.rhs())
                    better.remove(nonterm)
                    prod._rhs = tuple(better)

        gen_empty[:] = [prod.lhs() for prod in copied_prods 
                     if len(prod.rhs()) == 0]
        new_len = len(gen_empty)
        if new_len == N:
            break
        N = new_len

    print('gen_empty', gen_empty)
    # ADD NEW RULES
    new_prods = []
    productions = deepcopy(input_productions)
    for nonterm in gen_empty: 
        prods = [prod for prod in productions
                 if len(prod.rhs()) == 2 and nonterm in prod.rhs()]
        for prod in prods:
            rhs = list(prod.rhs())
            while nonterm in rhs:
                lhs = prod.lhs()
                rhs.remove(nonterm)
                p = Production(lhs, tuple(rhs))
                new_prods.append(p)
            
        
    productions += new_prods
    productions[:] = [p for p in productions if p.rhs()]
    return productions
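A worked example (a sketch, assuming _remove_empty_productions and its imports are in scope): with A nullable, S -> B is added and the empty production disappears.

from nltk import CFG

g = CFG.fromstring("S -> A B\nA -> 'a'\nA -> \nB -> 'b'")
print(_remove_empty_productions(g.productions(), letters=None))
# [S -> A B, A -> 'a', B -> 'b', S -> B]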
Example #15
    def induce_structure(self, sentences):

        sentences = [[c for c in s] for s in sentences]

        start_symbols = set()
        productions = []
        prod_table = {}

        # group all digits together
        digit_terminals = set([str(i) for i in range(10)])

        # unary rules
        terminals = set()
        for s in sentences:
            terminals.update(s)
        for t in terminals:
            if t in digit_terminals:
                nt = nltk.Nonterminal("Digit")
            else:
                nt = nltk.Nonterminal("Unary%s" % self.gen_nt())
            p = Production(nt, [t])
            productions.append(p)
            prod_table[tuple(p.rhs())] = p.lhs()

        sentences = self.apply_unary_prod(sentences, prod_table)

        while len(sentences) > 0:
            if self.has_recursion(sentences):
                p = self.generate_recursive_prod(sentences)
            else:
                p = self.generate_most_frequent_prod(sentences)

            productions.append(p)
            prod_table[tuple(p.rhs())] = p.lhs()

            sentences = self.update_with_prod(sentences, prod_table)

            new_sentences = []
            for s in sentences:
                if len(s) == 1:
                    start_symbols.add(s[0])
                else:
                    new_sentences.append(s)

            sentences = new_sentences

        # generate the start productions (iterate over a snapshot,
        # since we append to the list inside the loop)
        for symbol in start_symbols:
            for p in list(productions):
                if p.lhs() == symbol:
                    productions.append(Production(self.start, p.rhs()))

        self.grammar = nltk.induce_pcfg(self.start, productions)
Example #16
def convert_unit(grammar):
    '''
    Convert unitary rules in the form of [A -> B] where the rhs has one non-terminal
    by eliminating intermediate unitary rules and promoting the final lexical rule, e.g. [B -> 'b'] => [A -> 'b']
    or stop at an intermediate rule with only non-terminals on the rhs like [B -> C D] => [A -> C D]
    '''

    rules = grammar.productions()
    new_rules = []
    unit_rules = []
    for rule in rules:
        # check for unit rules
        if rule.is_nonlexical() and len(rule) == 1:
            unit_rules.append(rule)
        else:
            new_rules.append(rule)

    # following each unit rule and find the final terminal
    while unit_rules:
        rule = unit_rules.pop(0)
        lhs = rule.lhs()
        rhs = rule.rhs()
        # find rules that can derive the rhs to something else
        for cascade_rule in grammar.productions(lhs=rhs[0]):
            temp_rule = Production(lhs, cascade_rule.rhs())
            if cascade_rule.is_lexical() or len(cascade_rule) > 1:
                new_rules.append(temp_rule)
            else:
                unit_rules.append(temp_rule)

    new_grammar = CFG(grammar.start(), new_rules)

    return new_grammar
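A short check (a sketch, assuming convert_unit is in scope):

from nltk import CFG

g = CFG.fromstring("S -> A\nA -> B C\nA -> B\nB -> 'b'\nC -> 'c'")
for prod in convert_unit(g).productions():
    print(prod)
# A -> B C, B -> 'b', C -> 'c', plus the promoted S -> B C, A -> 'b', S -> 'b'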
Example #17
def proper_rule(person: FeatStruct) -> Production:
    """
    :person : feature structure that characterizes one person and contains a 'proper' attribute
    :return : the production rule that can generate the proper noun,
                e.g. ProperName[proper=Bas] -> 'Bas'
    """
    return Production(FeatStructNonterminal("ProperName[proper=%s]" % person["proper"]), [person["proper"]])
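For instance (a sketch, assuming proper_rule and FeatStructNonterminal are in scope):

from nltk.featstruct import FeatStruct

bas = FeatStruct(proper='Bas')
print(proper_rule(bas))  # e.g. ProperName[proper='Bas'] -> 'Bas'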
Example #18
def recursively_replace_lhs(rules, lhs, singles, keep_original):
    assert (all([lhs == p.lhs() for p in singles]))
    assert (all([len(p.rhs()) == 1 for p in singles]))
    out = []
    for r in rules:
        if lhs not in r.rhs():
            out += [r]
        else:
            if keep_original:
                out += [r]
            # find first occurrence
            for loc, t in enumerate(r.rhs()):
                if t == lhs:
                    break
            # substitute first occurrence
            new_rules = [
                Production(
                    r.lhs(),
                    list(r.rhs()[:loc]) + list(s.rhs()) +
                    list(r.rhs()[loc + 1:])) for s in singles
            ]
            out += recursively_replace_lhs(new_rules, lhs, singles,
                                           keep_original)

    return out
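A small demonstration (a sketch, assuming recursively_replace_lhs is in scope): inlining all A-productions into the rules that mention A.

from nltk import CFG

rules = CFG.fromstring("S -> A B\nA -> 'a1'\nA -> 'a2'\nB -> 'b'").productions()
A = rules[1].lhs()
singles = [p for p in rules if p.lhs() == A]
others = [p for p in rules if p.lhs() != A]
print(recursively_replace_lhs(others, A, singles, keep_original=False))
# [S -> 'a1' B, S -> 'a2' B, B -> 'b']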
Example #19
    def test_current_production(self):
        inputs_ = [("""
                (S
                    (sentence
                        (type_1_sentence_coord_1
                        (type_1_sentence_coord_2
                            (type_2_sentence
                            (THERE There)
                            (AUX is)
                            (Noun_Phrase
                                (det (DET an))
                                (Noun_w_support
                                (Adj_phrase
                                    (Adj_core (JJ small))
                                    (AND and)
                                    (Adj_phrase (Adj_core (JJ red))))
                                (Noun_Count (NN apple)))))))
                        (PERIOD .)))
                """, Production(Nonterminal("S"), [Nonterminal("sentence")]))]

        for i, (input_, expect_) in enumerate(inputs_):
            tree = Tree.parse(input_)
            production = current_production(tree)

            self.assertEqual(expect_, production)
Example #20
    def parse(self, phrasetokens, cleantree=True, maxtrees=200):
        '''
        :type tokens: builtins.generator
        :return:
        '''
        # check for tokens added by the POS processor -- e.g. ADV
        newprod = False
        # Add a comma and a terminal token to beginning and end of phrase
        COMMA = FGTerminal(',', 'COMMA', phrasetokens[-1].slice.stop)
        COMMA.lexentry = lexicon[(',',)]
        tokens = [FGTerminal('¢', 'EOP', 0)] + phrasetokens + [COMMA] + [FGTerminal('$', 'EOP', phrasetokens[-1].slice.stop)]

        for tokenindex, fltoken in enumerate(tokens):
            if not self._grammar._lexical_index.get(fltoken.lexword):
                newprod = True
                for lexent in fltoken.lexentry:
                    lexrhs = fltoken.lexword
                    newprod = Production(lexent, (lexrhs,))
                    self._grammar._productions.append(newprod)
        if newprod:
            self._grammar.__init__(self._grammar._start, self._grammar._productions)

        self._chart = self._parser.chart_parse([tk for tk in tokens if tk.POS != 'NULL'])
        # self._chart = self._parser.chart_parse([FGLeaf(tk) for tk in tokens if tk.POS != 'NULL'])
        treegen = self._chart.parses(self._grammar.start(), tree_class=Tree)
        trees = []
        for i, tree in enumerate(treegen):
            if i >= maxtrees:
                break
            if cleantree:
                cleanparsetree(tree)
            if tree not in trees:
                trees.append(tree)
        return trees
Example #21
 def productions(self):
     prod = []
     prod.append(Production(Nonterminal(self._label), self.children_name()))
     for i in self._child:
         if isinstance(i, Tree):
             prod.extend(i.productions())
     return prod
Example #22
def compact_nonterminal(x: str, nont: Nonterminal):
    GCFG = nltk.CFG.fromstring(x)
    prods = GCFG.productions()
    lhs_prods = [p for p in prods if p.lhs() == nont]
    old_prods = [p for p in prods if p not in lhs_prods]

    while True:
        new_prods = []
        for p in old_prods:
            if nont in p.rhs():
                # find first occurrence
                for i, t in enumerate(p.rhs()):
                    if t == nont:
                        break
                # now apply each replacement rule in turn
                for lhsp in lhs_prods:
                    if i < len(p.rhs()) - 1:  # if it's not the last token
                        new_rhs = p.rhs()[:i] + lhsp.rhs() + p.rhs()[(i + 1):]
                    else:
                        new_rhs = p.rhs()[:i] + lhsp.rhs()
                    # purge implicit H while we're at it
                    #new_rhs = [x for x in new_rhs if x!="'h'"]
                    this_new_p = Production(p.lhs(), new_rhs)
                    new_prods.append(this_new_p)
            else:
                new_prods.append(p)

        if new_prods == old_prods:
            break
        old_prods = new_prods

    new_str = ''.join([str(p).replace('\\\\', '\\') + '\n' for p in new_prods])
    # print(new_str)
    return new_str
Example #23
def process_unit_productions(productions, nonterminal_dict):
    # maintain a set which is same as the production list to speed up the program
    production_set = set(productions)
    need_another_loop = 0
    to_remove_list = []
    to_add_list = []
    for p in productions:
        if len(p.rhs()) == 1 and is_nonterminal(
                p.rhs()[0]):  # A->B, B is non-terminal
            to_remove_list.append(p)
            if p.rhs()[0] not in nonterminal_dict:
                nonterminal_dict[p.rhs()[0]] = [p.lhs()]
                need_another_loop = 1
            elif p.lhs() not in nonterminal_dict[p.rhs()[0]]:
                a = nonterminal_dict[p.rhs()[0]]
                a.append(p.lhs())
                nonterminal_dict[p.rhs()[0]] = a
                need_another_loop = 1
        elif p.lhs() in nonterminal_dict:  # B->C productions
            a = nonterminal_dict[p.lhs()]  # productions with B on the left
            for item in a:  # for every A in A->B
                new_production = Production(item, p.rhs())  # A->C
                if new_production not in production_set:
                    production_set.add(new_production)  # add to the grammar
                    to_add_list.append(new_production)
                    need_another_loop = 1
    return to_add_list, nonterminal_dict, need_another_loop, to_remove_list
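One plausible driver for this helper (hypothetical, not from the original source): given a production list, iterate to a fixed point.

nonterminal_dict = {}
need_another_loop = 1
while need_another_loop:
    to_add, nonterminal_dict, need_another_loop, to_remove = \
        process_unit_productions(productions, nonterminal_dict)
    productions = [p for p in productions if p not in to_remove] + to_add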
Example #24
def cover_tree(grammar, tree):
    tree_productions = set(tree.productions())
    # rebuild each grammar production so the subset test uses Production equality
    gram_productions = set(Production(p.lhs(), p.rhs()) for p in grammar.productions())
    return tree_productions.issubset(gram_productions)
Example #25
 def add_new_vocab_rule(self, rule):
     """
     Adds a new vocabulary rule to the set of rules, and
     recreates self.cfg and self.parser.
     """
     self.rules.append(Production(NT(rule[0]), rule[1]))
     self.cfg = ContextFreeGrammar(NT("S"), self.rules)
     self.parser = EarleyChartParser(self.cfg, trace=0)
Example #26
def literal_production(key, rhs):
    """ Return a production <key> -> n 

    :param key: symbol for lhs:
    :param rhs: string literal:
    """
    lhs = Nonterminal(key)
    return Production(lhs, [rhs])
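For instance (assuming the helper is in scope):

print(literal_production('DET', 'the'))  # DET -> 'the'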
Example #27
    def test_parse_production(self):
        inputs_ = [
            ("PP -> P NP",
             Production(Nonterminal("PP"),
                        [Nonterminal("P"), Nonterminal("NP")])),
            ("S -> NP VP",
             Production(
                 Nonterminal("S"),
                 [Nonterminal("NP"), Nonterminal("VP")])),
            ("THERE -> 'There'", Production(Nonterminal("THERE"), ['There']))
        ]
        for i, (input_, expect_) in enumerate(inputs_):
            production = parse_production(input_)

            error_msg = "Sentence {}-th -- '{}' -- Expect result: {} / Actual result: {}".format(
                i, input_, expect_, production)
            self.assertEqual(expect_, production, error_msg)
Example #28
def simple_rule(r):
    left = simple_nonterminal(r.lhs())
    if r.is_nonlexical():
        right = []
        for rh in r.rhs():
            right.append(simple_nonterminal(rh))
    else:
        right = r.rhs()
    return Production(left, right)
Example #29
def fail_demo():
    """
    Demo grammar that should not work with backtracking for all inputs
    """
    from nltk.grammar import Nonterminal, Production, ContextFreeGrammar
    S = Nonterminal('S')
    A = Nonterminal('A')
    productions = (
        Production(S, [ A, S, A ]),
        Production(S, [ A, A ]),
        Production(A, [ 'a' ]),
        )
    grammar = ContextFreeGrammar(S, productions)

    text = "a a a a a a".split()
    #text = "a a a a".split()

    RecursiveDescentApp(grammar, text).mainloop()
Example #30
def purge_implicit_h(x):
    GCFG = nltk.CFG.fromstring(x)
    old_prods = GCFG.productions()
    new_prods = []
    for p in old_prods:
        new_prods.append(Production(p.lhs(), [sym for sym in p.rhs() if sym != 'h']))

    new_str = ''.join([str(p).replace('\\\\', '\\') + '\n' for p in new_prods])
    # print(new_str)
    return new_str
Example #31
def fix_parse_production(line, nonterm_parser, probabilistic=False):
    """
    Parse a grammar rule, given as a string, and return
    a list of productions.
    """
    pos = 0

    # Parse the left-hand side.
    lhs, pos = nonterm_parser(line, pos)

    # Skip over the arrow.
    m = _ARROW_RE.match(line, pos)
    if not m: raise ValueError('Expected an arrow')
    pos = m.end()

    # Parse the right hand side.
    probabilities = [0.0]
    rhsides = [[]]
    while pos < len(line):
        # Probability.
        m = _PROBABILITY_RE.match(line, pos)
        if probabilistic and m:
            pos = m.end()
            probabilities[-1] = float(m.group(1)[1:-1])
            if probabilities[-1] > 1.0:
                raise ValueError('Production probability %f, '
                                 'should not be greater than 1.0' %
                                 (probabilities[-1], ))

        # String -- add terminal.
        elif (line[pos] in "\'\"" or line[pos:pos + 2] in ('u"', "u'")):
            m = _TERMINAL_RE.match(line, pos)
            if not m: raise ValueError('Unterminated string')
            rhsides[-1].append(eval(m.group(1)))
            pos = m.end()

        # Vertical bar -- start new rhside.
        elif line[pos] == '|':
            m = _DISJUNCTION_RE.match(line, pos)
            probabilities.append(0.0)
            rhsides.append([])
            pos = m.end()

        # Anything else -- nonterminal.
        else:
            nonterm, pos = nonterm_parser(line, pos)
            rhsides[-1].append(nonterm)

    if probabilistic:
        return [
            FixPP(lhs, rhs, prob=probability)
            for (rhs, probability) in zip(rhsides, probabilities)
        ]
    else:
        return [Production(lhs, rhs) for rhs in rhsides]
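For reference, NLTK's stock readers produce the same kinds of Production objects from rule strings; a minimal sketch:

from nltk import CFG, PCFG

# each '|' alternative becomes its own Production
print(CFG.fromstring("NP -> Det N | NP PP").productions())
# bracketed weights play the role of the _PROBABILITY_RE matches above
print(PCFG.fromstring("NP -> Det N [0.7] | NP PP [0.3]").productions())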