def train():
    """Train a PCFG parser on the Penn Treebank sample (nltk.corpus).

    Builds CNF parse trees, a frequency-based vocabulary, relative-frequency
    rule probabilities (Katz-smoothed), and rule indexes for CKY parsing.

    Returns:
        dict with keys 'vocab', 'left_rules', 'right_rules', 'unary_rules',
        'rules_to_prob', 'terminal_nonterms' — the state consumed by
        cky_parser().
    """
    print("Collecting sub-corpus from Penn Treebank (nltk.corpus)")
    # Convert every treebank tree to Chomsky normal form so the grammar
    # contains only unary and binary rules (required by CKY).
    tbank_trees = []
    for sent in treebank.parsed_sents():
        sent.chomsky_normal_form()
        tbank_trees.append(sent)
    # Build the vocabulary from the most frequent lowercased words.
    vocab_size = 10000  # cap vocabulary at the 10000 most frequent words
    words = [wrd.lower() for wrd in treebank.words()]
    # BUG FIX: the original counted the *unlowered* corpus words, leaving
    # `words` dead and producing a vocabulary that did not match the
    # lowercased terminals used in the lexical productions below.
    vocab = [wrd for wrd, freq in Counter(words).most_common(vocab_size)]
    # Estimate each rule's probability by relative frequency:
    # count(rule) / count(rule's LHS nonterminal).
    rules_to_prob = defaultdict(int)
    nonterm_occurrence = defaultdict(int)
    for sent in tbank_trees:
        for production in sent.productions():
            # Lowercase lexical (terminal) productions so they line up with
            # the lowercased vocabulary above.
            if len(production.rhs()) == 1 and not isinstance(production.rhs()[0], Nonterminal):
                production = Production(production.lhs(), [production.rhs()[0].lower()])
            nonterm_occurrence[production.lhs()] += 1
            rules_to_prob[production] += 1
    for rule in rules_to_prob:
        rules_to_prob[rule] /= nonterm_occurrence[rule.lhs()]
    # Katz smoothing reserves probability mass for unseen words (mapped to
    # the 'UNK' pseudo-word) and may adjust the vocabulary accordingly.
    rules_to_prob, vocab = katz_smooth(rules_to_prob, vocab)
    rules = list(rules_to_prob.keys())
    # Index binary rules by their first / second RHS symbol and unary rules
    # by their single RHS symbol, for O(1) candidate lookup during CKY.
    # (Dead locals from the original — an unused CFG object and a
    # rule->index map — were removed.)
    left_rules = defaultdict(set)
    right_rules = defaultdict(set)
    unary_rules = defaultdict(set)
    for rule in rules:
        if len(rule.rhs()) > 1:
            left_rules[rule.rhs()[0]].add(rule)
            right_rules[rule.rhs()[1]].add(rule)
        else:
            unary_rules[rule.rhs()[0]].add(rule)
    # Nonterminals that can rewrite directly to a terminal; used by the
    # 'EQL' backoff strategy in cky_parser().
    terminal_nonterms_rules = set(
        rule for rule in rules_to_prob
        if len(rule.rhs()) == 1 and isinstance(rule.rhs()[0], str)
    )
    terminal_nonterms = defaultdict(int)
    for rule in terminal_nonterms_rules:
        terminal_nonterms[rule.lhs()] += 1
    pcfg_parser = {
        'vocab': vocab,
        'left_rules': left_rules,
        'right_rules': right_rules,
        'unary_rules': unary_rules,
        'rules_to_prob': rules_to_prob,
        'terminal_nonterms': terminal_nonterms,
    }
    return pcfg_parser
def update_dictionary(lhs_dict, whole_dict, node):
    """Record one occurrence of the production rooted at *node*.

    Builds the production LHS -> children from the tree node, then bumps
    two counters in place: lhs_dict (occurrences per LHS nonterminal) and
    whole_dict (occurrences per full production).
    """
    prod = Production(Nonterminal(node.label()), get_child_names(node))
    lhs = prod.lhs()
    # .get with a default of 0 replaces the original explicit
    # initialise-if-missing checks; the resulting counts are identical.
    lhs_dict[lhs] = lhs_dict.get(lhs, 0) + 1
    whole_dict[prod] = whole_dict.get(prod, 0) + 1
def induce_structure(self, sentences):
    """Induce a PCFG from raw character sequences and store it in self.grammar.

    First assigns a unary nonterminal to every distinct terminal (all digits
    share a single "Digit" nonterminal), then repeatedly abstracts either a
    recursive pattern or the most frequent adjacent pair until every
    sentence reduces to one symbol. Those root symbols' productions are
    duplicated under self.start, and the grammar is estimated with
    nltk.induce_pcfg.
    """
    # Work on character lists; keep the caller's sentences untouched.
    sentences = [[c for c in s] for s in sentences]
    start_symbols = set()
    productions = []
    prod_table = {}
    # Group all digits under one shared nonterminal.
    digit_terminals = set([str(i) for i in range(10)])
    # Create one unary rule per distinct terminal.
    terminals = set()
    for s in sentences:
        terminals.update(s)
    for t in terminals:
        if t in digit_terminals:
            nt = nltk.Nonterminal("Digit")
        else:
            nt = nltk.Nonterminal("Unary%s" % self.gen_nt())
        p = Production(nt, [t])
        productions.append(p)
        prod_table[tuple(p.rhs())] = p.lhs()
    sentences = self.apply_unary_prod(sentences, prod_table)
    # Repeatedly abstract a pattern and rewrite the sentences with it;
    # fully-reduced sentences (length 1) contribute a start symbol.
    while len(sentences) > 0:
        if self.has_recursion(sentences):
            p = self.generate_recursive_prod(sentences)
        else:
            p = self.generate_most_frequent_prod(sentences)
        productions.append(p)
        prod_table[tuple(p.rhs())] = p.lhs()
        sentences = self.update_with_prod(sentences, prod_table)
        new_sentences = []
        for s in sentences:
            if len(s) == 1:
                start_symbols.add(s[0])
            else:
                new_sentences.append(s)
        sentences = new_sentences
    # Generate the start productions.
    # BUG FIX: the original appended to `productions` while iterating it,
    # which re-visits freshly appended rules and can loop forever if
    # self.start is itself a start symbol. Build the new rules from a
    # snapshot, then extend.
    start_prods = [
        Production(self.start, p.rhs())
        for p in productions
        if p.lhs() in start_symbols
    ]
    productions.extend(start_prods)
    self.grammar = nltk.induce_pcfg(self.start, productions)
def cky_parser(tokens, left_rules, right_rules, unary_rules, rules_to_prob, vocab,
               terminal_nonterms, backoff='UNK'):
    """Probabilistic CKY chart parser over a CNF grammar.

    Args:
        tokens: list of input word tokens.
        left_rules / right_rules: binary rules indexed by first / second RHS
            symbol (as built by train()).
        unary_rules: unary rules indexed by their single RHS symbol; lexical
            rules are keyed by *lowercased* terminals.
        rules_to_prob: mapping rule -> probability.
        vocab: known vocabulary (kept for interface compatibility).
        terminal_nonterms: nonterminals that can emit a terminal, used by
            the 'EQL' backoff.
        backoff: 'UNK' reuses the UNK pseudo-word's rules for unknown words;
            'EQL' assigns every terminal nonterminal uniform probability.

    Returns:
        Chart M where M[i][j] maps each nonterminal covering tokens[i:j] to
        its best log-score plus backpointers ('back', 'back_type').
    """
    n = len(tokens)
    M = [[{} for _ in range(n + 1)] for _ in range(n + 1)]
    for l in range(1, n + 1):          # span length
        for i in range(n - l + 1):     # span start
            ts = tokens[i:l + i]
            print("Processing: ", ts)
            cur_prod_dict = defaultdict(dict)
            if l == 1:
                # Lexical cell. BUG FIX: the original tested membership with
                # the raw token but indexed with the lowercased token, so a
                # known word that was not already lowercase passed the guard
                # yet iterated an empty rule set; use the lowercased form
                # consistently (training lowercases all terminals).
                word = tokens[i].lower()
                if word in unary_rules and len(unary_rules[word]) > 0:
                    for rule in unary_rules[word]:
                        cur_prod_dict[rule.lhs()] = {
                            'rule': rule,
                            'score': np.log(rules_to_prob[rule]),
                            'back': tokens[i],
                            'back_type': 'terminal'}
                elif backoff == 'UNK':
                    # Unknown word: borrow the UNK pseudo-word's lexical rules.
                    for rule in unary_rules['UNK']:
                        cur_prod_dict[rule.lhs()] = {
                            'rule': Production(rule.lhs(), [tokens[i]]),
                            'score': np.log(rules_to_prob[rule]),
                            'back': tokens[i],
                            'back_type': 'terminal'}
                elif backoff == 'EQL':
                    # Unknown word: uniform over terminal-emitting nonterminals.
                    for nonterm in terminal_nonterms:
                        rule = Production(nonterm, [tokens[i]])
                        cur_prod_dict[rule.lhs()] = {
                            'rule': rule,
                            'score': np.log(1.0 / len(terminal_nonterms)),
                            'back': tokens[i],
                            'back_type': 'terminal'}
            # Combine sub-spans at every split point s (empty range when l == 1).
            for s in range(i + 1, i + l):
                left_set = list(M[i][s].keys())
                right_set = list(M[s][i + l].keys())
                for left in left_set:
                    prodsl = left_rules[left]
                    for right in right_set:
                        # Only rules whose RHS is exactly (left, right).
                        prodsr = right_rules[right].intersection(prodsl)
                        for rule in prodsr:
                            P = np.log(rules_to_prob[rule])
                            nscore = P + M[i][s][left]['score'] + M[s][i + l][right]['score']
                            # Keep the highest-scoring derivation per LHS.
                            if rule.lhs() not in cur_prod_dict or nscore > cur_prod_dict[rule.lhs()]['score']:
                                cur_prod_dict[rule.lhs()]['rule'] = rule
                                cur_prod_dict[rule.lhs()]['score'] = nscore
                                cur_prod_dict[rule.lhs()]['back'] = [left, right, s]
                                cur_prod_dict[rule.lhs()]['back_type'] = 'binary_split'
            # Close the cell under unary (nonterminal -> nonterminal) rules.
            M[i][i + l] = handle_unary(cur_prod_dict, unary_rules, rules_to_prob)
            if len(M[i][i + l]) == 0 and l == 1:
                # No lexical rule covered this token: the parse cannot succeed.
                print("Failed to generate any productions for '%s' substring" % (' '.join(ts)))
                return M
    return M