import pickle
from collections import Counter, defaultdict
from typing import Optional

import nltk
import numpy
from nltk import CFG, Nonterminal, Production, Tree
from nltk.corpus import treebank


def train():
    print("Collecting sub-corpus from Penn Treebank (nltk.corpus)")
    # Prepare parse trees extracted from the treebank, converted to CNF.
    tbank_trees = []
    for sent in treebank.parsed_sents():
        sent.chomsky_normal_form()
        tbank_trees.append(sent)
    # Build the vocabulary list from the treebank, keeping the 10000 most frequent words.
    vocab_size = 10000
    words = [wrd.lower() for wrd in treebank.words()]
    vocab = [wrd for wrd, freq in Counter(words).most_common(vocab_size)]
    # Generate the grammar rule list from the treebank and estimate each rule's
    # probability from its relative frequency.
    tbank_productions = set(production for tree in tbank_trees
                            for production in tree.productions())
    tbank_grammar = CFG(Nonterminal('S'), list(tbank_productions))
    production_rules = tbank_grammar.productions()
    rules_to_prob = defaultdict(int)
    nonterm_occurrence = defaultdict(int)
    # Count rule and LHS occurrences, lowercasing the RHS of lexical rules.
    for sent in tbank_trees:
        for production in sent.productions():
            if len(production.rhs()) == 1 and not isinstance(production.rhs()[0], Nonterminal):
                production = Production(production.lhs(), [production.rhs()[0].lower()])
            nonterm_occurrence[production.lhs()] += 1
            rules_to_prob[production] += 1
    for rule in rules_to_prob:
        rules_to_prob[rule] /= nonterm_occurrence[rule.lhs()]
    # Apply Katz smoothing (katz_smooth is defined elsewhere in this project).
    rules_to_prob, vocab = katz_smooth(rules_to_prob, vocab)
    rules = list(rules_to_prob.keys())
    rules_reverse_dict = dict((j, i) for i, j in enumerate(rules))
    left_rules = defaultdict(set)
    right_rules = defaultdict(set)
    unary_rules = defaultdict(set)
    # Index binary rules by their left and right RHS symbols; unary rules by their only symbol.
    for rule in rules:
        if len(rule.rhs()) > 1:
            left_rules[rule.rhs()[0]].add(rule)
            right_rules[rule.rhs()[1]].add(rule)
        else:
            unary_rules[rule.rhs()[0]].add(rule)
    terminal_nonterms_rules = set(rule for rule in rules_to_prob
                                  if len(rule.rhs()) == 1
                                  and isinstance(rule.rhs()[0], str))
    terminal_nonterms = defaultdict(int)
    for rule in terminal_nonterms_rules:
        terminal_nonterms[rule.lhs()] += 1
    pcfg_parser = {
        'vocab': vocab,
        'left_rules': left_rules,
        'right_rules': right_rules,
        'unary_rules': unary_rules,
        'rules_to_prob': rules_to_prob,
        'terminal_nonterms': terminal_nonterms
    }
    return pcfg_parser
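# A minimal usage sketch of train(), assuming the treebank sample has been
# downloaded via nltk.download('treebank') and that katz_smooth is available;
# the helper name _demo_train is hypothetical, not part of the pipeline.
def _demo_train():
    pcfg_parser = train()
    print('vocabulary size:', len(pcfg_parser['vocab']))
    # Inspect one production and its estimated probability.
    rule, prob = next(iter(pcfg_parser['rules_to_prob'].items()))
    print('sample rule:', rule, 'probability:', prob)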
def find_lhs_lexical_rule(word: str, grammar: CFG) -> Optional[Nonterminal]:
    """
    Finds the LHS of a lexical rule contained in the input CFG grammar.

    :param word: the RHS of a lexical rule
    :param grammar: input CFG grammar
    :return: the LHS of a lexical rule if it exists, otherwise None
    """
    lexical_rules = [prod for prod in grammar.productions()
                     if len(prod.rhs()) == 1 and prod.rhs()[0] == word]
    if lexical_rules:
        return lexical_rules[0].lhs()
    return None
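# A small sketch of find_lhs_lexical_rule on a hand-written toy grammar;
# the grammar below is illustrative, not part of the treebank pipeline.
def _demo_lexical_lookup():
    toy = CFG.fromstring("""
        S -> NP VP
        NP -> 'dog'
        VP -> 'barks'
    """)
    print(find_lhs_lexical_rule('dog', toy))    # NP
    print(find_lhs_lexical_rule('meows', toy))  # None: no lexical rule covers it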
def cky_parsing(words: list, grammar: CFG, draw=False):
    """Fills an (n x n) CKY chart bottom-up for a grammar in CNF."""
    n = len(words) + 1
    table = numpy.ndarray(shape=(n, n), dtype=object)
    for j in range(1, n):  # Looping over the columns
        # Fill the bottom cell [j-1, j] with one tree per matching lexical rule.
        table[j - 1, j] = [
            Tree(rule.lhs(), [words[j - 1]])
            for rule in grammar.productions()
            if len(rule.rhs()) == 1 and rule.rhs()[0] == words[j - 1]
        ]
        for i in reversed(range(0, j - 1)):
            for k in range(i + 1, j):  # Looping over the possible split locations between i and j
                # Labels heading the candidate left and right subtrees.
                left_labels = ([tree.label() for tree in table[i, k]]
                               if table[i, k] is not None else [])
                right_labels = ([tree.label() for tree in table[k, j]]
                                if table[k, j] is not None else [])
                # Build a tree for every binary rule whose RHS symbols appear
                # in the two cells (the first tree of each cell is attached).
                trees = [
                    Tree(rule.lhs(), [table[i, k][0], table[k, j][0]])
                    for rule in grammar.productions()
                    if len(rule.rhs()) == 2
                    and rule.rhs()[0] in left_labels
                    and rule.rhs()[1] in right_labels
                ]
                table[i, j] = trees if table[i, j] is None else trees + table[i, j]
    if draw and len(table[0, n - 1]) != 0:
        table[0, n - 1][0].draw()
    # Accept only if the top cell holds a tree rooted at the start symbol S.
    if len(table[0, n - 1]) != 0 and table[0, n - 1][0].label() == Nonterminal("S"):
        return table[0, n - 1][0]
    return Tree("Grammar error", [])
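# A sketch exercising cky_parsing on a tiny CNF grammar; illustrative only,
# independent of the treebank-trained rules above.
def _demo_cky():
    toy = CFG.fromstring("""
        S -> NP VP
        VP -> V NP
        NP -> 'she' | 'fish'
        V -> 'eats'
    """)
    tree = cky_parsing(['she', 'eats', 'fish'], toy)
    print(tree)  # (S (NP she) (VP (V eats) (NP fish)))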
def find_lhs_grammar_rule(first: Tree, second: Tree, grammar: CFG) -> Optional[Nonterminal]:
    """
    Finds the LHS of a grammar rule contained in the input CFG grammar.

    :param first: first half of the grammar rule's RHS
    :param second: latter half of the grammar rule's RHS
    :param grammar: input CFG grammar
    :return: the LHS of a grammar rule if it exists, otherwise None
    """
    grammar_rules = [prod for prod in grammar.productions()
                     if len(prod.rhs()) == 2
                     and first.label() == prod.rhs()[0]
                     and second.label() == prod.rhs()[1]]
    if grammar_rules:
        return grammar_rules[0].lhs()
    return None
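# A sketch of find_lhs_grammar_rule with hand-built subtrees; their labels
# are Nonterminal objects, matching what cky_parsing stores in its chart.
def _demo_grammar_lookup():
    toy = CFG.fromstring("""
        S -> NP VP
        NP -> 'she'
        VP -> 'runs'
    """)
    np_tree = Tree(Nonterminal('NP'), ['she'])
    vp_tree = Tree(Nonterminal('VP'), ['runs'])
    print(find_lhs_grammar_rule(np_tree, vp_tree, toy))  # S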
def make_prod_map(cfg: nltk.CFG):
    '''
    Assigns an index to each production, in the form: {<production>: <index>}.
    '''
    prod_map = {}
    for i, prod in enumerate(cfg.productions()):
        prod_map[prod] = i
    return prod_map
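# A sketch of make_prod_map on a toy grammar; indices follow the order in
# which cfg.productions() enumerates the rules.
def _demo_prod_map():
    toy = nltk.CFG.fromstring("""
        S -> NP VP
        NP -> 'she'
        VP -> 'runs'
    """)
    for prod, idx in make_prod_map(toy).items():
        print(idx, prod)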
# Save the CNF grammar to disk.
with open('cnf_grammar.pkl', 'wb') as f:
    pickle.dump(cnf_, f)

# In[26]:

# Check that the converted grammar really is in Chomsky normal form.
print(cnf_.is_chomsky_normal_form())

# In[27]:

# Sort the CNF rules by their left-hand side (left_ is defined elsewhere).
fin = list(cnf_.productions())
fin.sort(key=left_)

# In[28]:

with open('atis_cnf.pkl', 'wb') as f:
    pickle.dump(fin, f)

# In[29]:

# Reload the pickled rules and print them to verify the round trip.
with open('atis_cnf.pkl', 'rb') as f:
    a = pickle.load(f)
for i in a:
    print(i)