from collections import Counter, defaultdict

from nltk.corpus import treebank
from nltk.grammar import CFG, Nonterminal, Production


def train():
    print("Collecting sub-corpus from Penn Treebank (nltk.corpus)")

    # prepare parse trees extracted from the treebank, converted to Chomsky normal form
    tbank_trees = []
    for sent in treebank.parsed_sents():
        sent.chomsky_normal_form()
        tbank_trees.append(sent)
    
    # build the vocabulary: the vocab_size most frequent lowercased treebank words
    vocab_size = 10000
    words = [wrd.lower() for wrd in treebank.words()]
    vocab = [wrd for wrd, freq in Counter(words).most_common(vocab_size)]
    
    # generate the list of grammar rules extracted from the treebank and estimate each rule's probability from its frequency
    tbank_productions = set(production for tree in tbank_trees for production in tree.productions())
    tbank_grammar = CFG(Nonterminal('S'), list(tbank_productions))
    production_rules = tbank_grammar.productions()
    rules_to_prob = defaultdict(int)
    nonterm_occurrence = defaultdict(int)
    
    # calculate probabilities for rules (relative frequency per LHS)
    for sent in tbank_trees:
        for production in sent.productions():
            if len(production.rhs()) == 1 and not isinstance(production.rhs()[0], Nonterminal):
                production = Production(production.lhs(), [production.rhs()[0].lower()])
            nonterm_occurrence[production.lhs()] += 1
            rules_to_prob[production] += 1
    for rule in rules_to_prob:
        rules_to_prob[rule] /= nonterm_occurrence[rule.lhs()]
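    # rules_to_prob now holds the maximum-likelihood estimates
    # P(A -> beta) = count(A -> beta) / count(A), i.e. relative frequency per LHS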

    # apply Katz smoothing; katz_smooth is assumed to be defined elsewhere in this project
    rules_to_prob, vocab = katz_smooth(rules_to_prob, vocab)
    rules = list(rules_to_prob.keys())
    rules_reverse_dict = dict((j,i) for i, j in enumerate(rules))
    left_rules = defaultdict(set)
    right_rules = defaultdict(set)
    unary_rules = defaultdict(set)
    
    # index binary rules by their left/right RHS symbol, unary rules by their only RHS symbol
    for rule in rules:
        if len(rule.rhs()) > 1:
            left_rules[rule.rhs()[0]].add(rule)
            right_rules[rule.rhs()[1]].add(rule)
        else:
            unary_rules[rule.rhs()[0]].add(rule)
    terminal_nonterms_rules = set(rule for rule in rules_to_prob if len(rule.rhs()) == 1 and isinstance(rule.rhs()[0], str))
    terminal_nonterms = defaultdict(int)
    for rule in terminal_nonterms_rules:
        terminal_nonterms[rule.lhs()] += 1
    pcfg_parser = {
        'vocab': vocab,
        'left_rules': left_rules,
        'right_rules': right_rules,
        'unary_rules': unary_rules,
        'rules_to_prob': rules_to_prob,
        'terminal_nonterms': terminal_nonterms
    }
    return pcfg_parser
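
# A minimal usage sketch (an assumption, not from the original code): build the
# PCFG tables once via train(), then inspect them. Note that train() requires
# the katz_smooth function defined elsewhere in this project.
pcfg_parser = train()
print(len(pcfg_parser['vocab']))  # size of the smoothed vocabulary
rule = next(iter(pcfg_parser['rules_to_prob']))
print(rule, pcfg_parser['rules_to_prob'][rule])  # one rule and its estimated probability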
def find_lhs_lexical_rule(word: str, grammar: CFG) -> Nonterminal:
    """
    Finds the LHS of a lexical rule contained in the input CFG grammar.

    :param word: the RHS of a lexical rule
    :param grammar: input CFG grammar
    :return: the LHS of a lexical rule, if it exists
    """
    lexical_rules = [prod for prod in grammar.productions()
                     if len(prod.rhs()) == 1 and prod.rhs()[0] == word]
    if lexical_rules:
        return lexical_rules[0].lhs()
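
# A small usage example (illustrative; this toy grammar is not part of the original code):
toy_grammar = CFG.fromstring("""
S -> NP VP
NP -> 'dogs'
VP -> 'bark'
""")
print(find_lhs_lexical_rule('dogs', toy_grammar))  # NP
print(find_lhs_lexical_rule('meow', toy_grammar))  # None: no lexical rule yields 'meow'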
Example #3
import numpy
from nltk import CFG, Nonterminal, Tree


def cky_parsing(words: list, grammar: CFG, draw=False):
    """CKY parser for a grammar in Chomsky normal form. table[i, j] collects
    the partial parse trees spanning words[i:j]."""
    n = len(words) + 1
    table = numpy.empty(shape=(n, n), dtype=object)  # object cells start as None

    for j in range(1, n):  # looping over the columns
        # fill the bottom cell with a tree for every lexical rule yielding words[j - 1]
        table[j - 1, j] = [
            Tree(rule.lhs(), [words[j - 1]])
            for rule in grammar.productions()
            if len(rule.rhs()) == 1 and rule.rhs()[0] == words[j - 1]
        ]

        for i in reversed(range(0, j - 1)):
            # looping over the possible split locations k between i and j
            for k in range(i + 1, j):
                trees = []
                if table[i, k] is not None and table[k, j] is not None:
                    left_labels = [t.label() for t in table[i, k]]
                    right_labels = [t.label() for t in table[k, j]]
                    trees = [
                        Tree(rule.lhs(), [table[i, k][0], table[k, j][0]])
                        for rule in grammar.productions()
                        if len(rule.rhs()) == 2
                        and rule.rhs()[0] in left_labels
                        and rule.rhs()[1] in right_labels
                    ]
                table[i, j] = trees if table[i, j] is None else trees + table[i, j]

    if draw and len(table[0, n - 1]) != 0:
        table[0, n - 1][0].draw()
    if len(table[0, n - 1]) != 0 and table[0, n - 1][0].label() == Nonterminal("S"):
        return table[0, n - 1][0]
    return Tree("Grammar error", [])
def find_lhs_grammar_rule(first: Tree, second: Tree, grammar: CFG) -> Nonterminal:
    """
    Finds the LHS of a grammar rule contained in the input CFG grammar.

    :param first: first half of the grammar rule's RHS
    :param second: latter half of the grammar rule's RHS
    :param grammar: input CFG grammar
    :return: the LHS of a grammar rule, if it exists
    """
    grammar_rules = [prod for prod in grammar.productions()
                     if len(prod.rhs()) == 2
                     and first.label() == prod.rhs()[0] and second.label() == prod.rhs()[1]]

    if grammar_rules:
        return grammar_rules[0].lhs()
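
# Continuing the sketch above: two subtrees labelled NP and VP resolve to S under toy_cnf.
left = Tree(Nonterminal('NP'), ['dogs'])
right = Tree(Nonterminal('VP'), ['chase', 'cats'])  # only the labels matter here
print(find_lhs_grammar_rule(left, right, toy_cnf))  # S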
Example #5
import nltk


def make_prod_map(cfg: nltk.CFG):
    ''' Assigns an index to each production, in the form: {'<production>': <index>}. '''
    prod_map = {}
    for i, prod in enumerate(cfg.productions()):
        prod_map[prod] = i
    return prod_map
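
# Illustrative usage on a small hypothetical grammar: each production maps to
# its position in cfg.productions().
toy = nltk.CFG.fromstring("""
S -> NP VP
NP -> 'dogs'
VP -> 'bark'
""")
for prod, idx in make_prod_map(toy).items():
    print(idx, prod)  # 0 S -> NP VP, then 1 NP -> 'dogs', then 2 VP -> 'bark'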
Example #6
import pickle

# Save the CNF grammar built above (cnf_ comes from an earlier notebook cell).
with open('cnf_grammar.pkl', 'wb') as f:
    pickle.dump(cnf_, f)

# In[26]:

#  Check CNF

print(cnf_.is_chomsky_normal_form())

# In[27]:

# CNF Rules

fin = list(cnf_.productions())
fin.sort(key=left_)  # left_: sort key defined in an earlier cell, presumably the rule's LHS

# In[28]:

with open('atis_cnf.pkl', 'wb') as f:
    pickle.dump(fin, f)

# In[29]:

with open('atis_cnf.pkl', 'rb') as f:
    a = pickle.load(f)

for i in a:
    print(i)  # the source is truncated here; printing each reloaded rule is the likely intent