from collections import Counter, defaultdict

from nltk import CFG, Nonterminal, Production
from nltk.corpus import treebank


def train():
    print("Collecting sub-corpus from Penn Treebank (nltk.corpus)")
    
    # prepare parse trees extracted from the treebank, converted to
    # Chomsky normal form so that CKY parsing applies
    tbank_trees = []
    for sent in treebank.parsed_sents():
        sent.chomsky_normal_form()
        tbank_trees.append(sent)
    
    # build the vocabulary: the vocab_size most frequent lowercased words
    vocab_size = 10000
    words = [wrd.lower() for wrd in treebank.words()]
    vocab = [wrd for wrd, freq in Counter(words).most_common(vocab_size)]
    
    # collect the grammar rules extracted from the treebank and estimate
    # each rule's probability from its relative frequency
    tbank_productions = set(production for tree in tbank_trees for production in tree.productions())
    tbank_grammar = CFG(Nonterminal('S'), list(tbank_productions))
    production_rules = tbank_grammar.productions()
    rules_to_prob = defaultdict(int)
    nonterm_occurrence = defaultdict(int)
    
    # count rule and LHS occurrences (lowercasing lexical rules) for the estimates
    for sent in tbank_trees:
        for production in sent.productions():
            if len(production.rhs()) == 1 and not isinstance(production.rhs()[0], Nonterminal):
                production = Production(production.lhs(), [production.rhs()[0].lower()])
            nonterm_occurrence[production.lhs()] += 1
            rules_to_prob[production] += 1
    for rule in rules_to_prob:
        rules_to_prob[rule] /= nonterm_occurrence[rule.lhs()]
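    # rules_to_prob[r] is now count(r) / count(lhs(r)): the relative-frequency
    # (maximum-likelihood) estimate of P(rhs | lhs) for each production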

    # use Katz smoothing
    rules_to_prob, vocab = katz_smooth(rules_to_prob, vocab)
    rules = list(rules_to_prob.keys())
    rules_reverse_dict = {rule: idx for idx, rule in enumerate(rules)}
    left_rules = defaultdict(set)
    right_rules = defaultdict(set)
    unary_rules = defaultdict(set)
    
    # index binary rules by their left and right children, unary rules by their single child
    for rule in rules:
        if len(rule.rhs()) > 1:
            left_rules[rule.rhs()[0]].add(rule)
            right_rules[rule.rhs()[1]].add(rule)
        else:
            unary_rules[rule.rhs()[0]].add(rule)
    terminal_nonterms_rules = set(rule for rule in rules_to_prob if len(rule.rhs()) == 1 and isinstance(rule.rhs()[0], str))
    terminal_nonterms = defaultdict(int)
    for rule in terminal_nonterms_rules:
        terminal_nonterms[rule.lhs()] += 1
    pcfg_parser = {
        'vocab': vocab,
        'left_rules': left_rules,
        'right_rules': right_rules,
        'unary_rules': unary_rules,
        'rules_to_prob': rules_to_prob,
        'terminal_nonterms': terminal_nonterms
    }
    return pcfg_parser
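A minimal usage sketch, assuming the nltk treebank sample has been downloaded and that a katz_smooth implementation is in scope (neither is shown above):

import nltk
nltk.download('treebank')  # fetch the Penn Treebank sample once

pcfg_parser = train()
print(len(pcfg_parser['vocab']))  # size of the lowercased vocabulary
rule = next(iter(pcfg_parser['rules_to_prob']))
print(rule, pcfg_parser['rules_to_prob'][rule])  # a rule and its probability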
Code example #2
def update_dictionary(lhs_dict, whole_dict, node):
    # record one occurrence of this node's production: lhs_dict counts how often
    # each LHS nonterminal expands, whole_dict how often each full rule is used
    production = Production(Nonterminal(node.label()), get_child_names(node))
    if production.lhs() not in lhs_dict:
        lhs_dict[production.lhs()] = 0
    if production not in whole_dict:
        whole_dict[production] = 0

    lhs_dict[production.lhs()] += 1
    whole_dict[production] += 1
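A hypothetical illustration of how the two counters accumulate over a tree. The get_child_names helper is not shown above, so a plausible stand-in is sketched here as an assumption:

from nltk import Tree, Production, Nonterminal  # Production/Nonterminal are also used by update_dictionary

def get_child_names(node):
    # assumed behavior: nonterminal children by label, leaf words as-is
    return [Nonterminal(c.label()) if isinstance(c, Tree) else c for c in node]

lhs_counts, rule_counts = {}, {}
tree = Tree.fromstring("(S (NP (DT the) (NN dog)) (VP (VBZ barks)))")
for node in tree.subtrees():
    update_dictionary(lhs_counts, rule_counts, node)
print(len(rule_counts))  # 6 distinct productions, each seen once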
Code example #3
File: WhatLearner.py  Project: pearlfranz20/AL_Core
    def induce_structure(self, sentences):

        # treat each sentence as a flat sequence of single-character terminals
        sentences = [[c for c in s] for s in sentences]

        start_symbols = set()
        productions = []
        prod_table = {}

        # group all digits together
        digit_terminals = set([str(i) for i in range(10)])

        # unary rules
        terminals = set()
        for s in sentences:
            terminals.update(s)
        for t in terminals:
            if t in digit_terminals:
                nt = nltk.Nonterminal("Digit")
            else:
                nt = nltk.Nonterminal("Unary%s" % self.gen_nt())
            p = Production(nt, [t])
            productions.append(p)
            prod_table[tuple(p.rhs())] = p.lhs()

        sentences = self.apply_unary_prod(sentences, prod_table)
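        # each sentence is now a sequence of nonterminals; the loop below
        # repeatedly replaces a chosen pattern with a fresh nonterminal until
        # every sentence collapses to a single start symbol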

        while len(sentences) > 0:
            if self.has_recursion(sentences):
                p = self.generate_recursive_prod(sentences)
            else:
                p = self.generate_most_frequent_prod(sentences)

            productions.append(p)
            prod_table[tuple(p.rhs())] = p.lhs()

            sentences = self.update_with_prod(sentences, prod_table)

            new_sentences = []
            for s in sentences:
                if len(s) == 1:
                    start_symbols.add(s[0])
                else:
                    new_sentences.append(s)

            sentences = new_sentences

        # generate the start productions; collect them first and extend afterwards,
        # since appending to a list while iterating over it would revisit new items
        start_prods = []
        for symbol in start_symbols:
            for p in productions:
                if p.lhs() == symbol:
                    start_prods.append(Production(self.start, p.rhs()))
        productions.extend(start_prods)

        self.grammar = nltk.induce_pcfg(self.start, productions)
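For reference, nltk.induce_pcfg (used in the final line above) attaches relative-frequency probabilities to a flat production list; a self-contained sketch with toy productions, assumed purely for illustration:

import nltk
from nltk import Nonterminal, Production

S = Nonterminal('S')
prods = [Production(S, [Nonterminal('Digit'), Nonterminal('Digit')]),
         Production(S, ['x']),
         Production(S, ['x'])]
print(nltk.induce_pcfg(S, prods))
# expect roughly: S -> x [0.666667], S -> Digit Digit [0.333333]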

import numpy as np
from collections import defaultdict
from nltk import Production


def cky_parser(tokens, left_rules, right_rules, unary_rules, rules_to_prob, vocab, terminal_nonterms, backoff='UNK'):
    # chart cell M[i][j] maps nonterminals to their best analysis of tokens[i:j]
    M = [[{} for _ in range(len(tokens)+1)] for _ in range(len(tokens)+1)]
    for l in range(1, len(tokens) + 1):
        for i in range(len(tokens) - l + 1):
            ts = tokens[i:l+i]
            print("Processing: ", ts)
            cur_prod_dict = defaultdict(dict)
            if l == 1:
                word = tokens[i].lower()  # lexical rules were lowercased in training
                if word in unary_rules and len(unary_rules[word]) > 0:
                    for rule in unary_rules[word]:
                        cur_prod_dict[rule.lhs()] = {
                            'rule': rule,
                            'score': np.log(rules_to_prob[rule]),
                            'back': tokens[i],
                            'back_type': 'terminal'
                        }
                elif backoff == 'UNK':
                    for rule in unary_rules['UNK']:
                        cur_prod_dict[rule.lhs()] = {
                            'rule': Production(rule.lhs(), [tokens[i]]),
                            'score': np.log(rules_to_prob[rule]),
                            'back': tokens[i],
                            'back_type': 'terminal'
                        }
                elif backoff == 'EQL':
                    for nonterm in terminal_nonterms:
                        rule = Production(nonterm, [tokens[i]])
                        cur_prod_dict[rule.lhs()] = {
                            'rule': rule,
                            'score': np.log(1.0/len(terminal_nonterms)),
                            'back': tokens[i],
                            'back_type': 'terminal'}
            for s in range(i+1, i+l):
                left_set = list(M[i][s].keys())
                right_set = list(M[s][i+l].keys())
                for left in left_set:
                    prodsl = left_rules[left]
                    for right in right_set:
                        prodsr = right_rules[right].intersection(prodsl)
                        for rule in prodsr:
                            P = np.log(rules_to_prob[rule])
                            nscore = P + M[i][s][left]['score'] + M[s][i+l][right]['score']
                            if rule.lhs() not in cur_prod_dict or nscore > cur_prod_dict[rule.lhs()]['score']:
                                cur_prod_dict[rule.lhs()]['rule'] = rule
                                cur_prod_dict[rule.lhs()]['score'] = nscore
                                cur_prod_dict[rule.lhs()]['back'] = [left, right, s]
                                cur_prod_dict[rule.lhs()]['back_type'] = 'binary_split'
            M[i][i+l] = handle_unary(cur_prod_dict, unary_rules, rules_to_prob)
            if len(M[i][i+l]) == 0 and l == 1:
                print("Failed to generate any productions for '%s' substring" % (' '.join(ts)))
                return M
            #print("M[%d][%d] = " % (i, i+l), M[i][i+l])
    return M
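A hypothetical end-to-end run, assuming the train() example above completed successfully; the cell M[0][len(tokens)] holds the best analyses of the whole span, so an empty cell means the parse failed:

parser = train()
tokens = "the old man a boat".split()
M = cky_parser(tokens,
               parser['left_rules'], parser['right_rules'],
               parser['unary_rules'], parser['rules_to_prob'],
               parser['vocab'], parser['terminal_nonterms'])
root = M[0][len(tokens)]
if root:
    best = max(root, key=lambda nt: root[nt]['score'])
    print("Best root:", best, "log-prob:", root[best]['score'])
else:
    print("No parse spans the whole sentence")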