def test(grammarText, sentences):
    """Test the coverage of a CFG grammar.

    grammarText -- the grammar string
    sentences -- a list of sentences to test, with invalid ones prefixed with '*'

    Raises AssertionError if a valid sentence has no parse or an invalid
    sentence parses.
    """
    # startswith() is safe on an empty string, unlike s[0] indexing,
    # which raised IndexError if an empty sentence slipped into the list.
    valid_sentences = [s for s in sentences if not s.startswith('*')]
    invalid_sentences = [s[1:] for s in sentences if s.startswith('*')]
    parser = ChartParser(parse_cfg(grammarText))
    for sentence in valid_sentences:
        parses = parser.nbest_parse(sentence.split())
        print(sentence + "\n" + "\n".join(map(str, parses)) + "\n")
        assert parses, "Valid sentence failed to parse."
    for sentence in invalid_sentences:
        print("*" + sentence)
        parses = parser.nbest_parse(sentence.split())
        assert parses == [], "Invalid sentence parsed successfully."
def parse(sentence, cfg):
    """Parse *sentence* with the grammar text *cfg* and draw the parse trees."""
    tokens = sentence.split()
    chart_parser = ChartParser(parse_cfg(cfg), TD_STRATEGY)
    trees = chart_parser.get_parse_list(tokens)
    draw.draw_trees(*trees)
def guess(self, verbose=None):
    """
    Makes a guess based on the next observation.
    Updates self._curr_guess.

    :param verbose: if not None, overrides self._verbose for logging
    :rtype: CFG
    :returns: The next guess
    """
    if verbose is not None:
        self._verbose = verbose

    sentence = Sentence(next(self._text))
    self._num_steps += 1
    self._log("String {}: {}".format(self._num_steps, sentence))
    if sentence in self._data:
        # Nothing new to learn from a repeated observation.
        self._log("String already seen")
        return self._curr_guess

    # Info from previous guess, used below to report deltas.
    num_contexts = len(self._contexts)
    num_subs = len(self._substrings)
    if self._curr_guess is not None:
        # -1 excludes the start symbol from the nonterminal count.
        num_nts = len(set(p.lhs() for p in self._curr_guess.productions())) - 1
    else:
        num_nts = 0

    total_timer = Timer()
    total_timer.start()

    # Update data and terminals
    words = sentence.get_words()
    self._data.add(sentence)
    self._terminals.update(set(words))

    # Update contexts: every (prefix, suffix) split of the observed words.
    self._log("Updating contexts...")
    inds = range(0, len(words) + 1)
    contexts = [
        Context(words[:i], words[j:]) for i in inds for j in inds[i:]
    ]
    self._contexts.update(ContextSet(contexts))
    self._log(
        "{} new contexts added".format(len(self._contexts) - num_contexts))

    # Update substrings, but only if the current guess does not already
    # generate the sentence (in which case its substrings add nothing).
    self._log("Updating substrings...")
    is_new_sentence = True
    if self._curr_guess_parser is not None:
        try:
            parses = self._curr_guess_parser.parse(words)
            is_new_sentence = len(list(parses)) == 0
        except Exception:
            # Best-effort check: treat any parser failure as "not generated".
            # (Was a bare `except:`, which also caught KeyboardInterrupt.)
            is_new_sentence = True
    if is_new_sentence:
        subs = [Sentence(words[i:j]) for i in inds for j in inds[i:]]
        self._substrings.update(SentenceSet(subs))
        self._log("{} new substrings added".format(
            len(self._substrings) - num_subs))
    else:
        self._log("Sentence already generated by current guess")

    # Construct the nonterminals: one per kernel, i.e. per subset of
    # substrings of size at most self._k.
    self._log("Constructing nonterminals...")
    kernels = set()
    for i in range(1, self._k + 1):
        subsets = [
            SentenceSet(j) for j in combinations(self._substrings, i)
        ]
        kernels.update(subsets)
    for kernel in kernels:
        if kernel not in self._nonterminals:
            nt_name = self._new_name()
            contexts = self._oracle.restr_right_triangle(
                kernel, self._contexts)
            nt = Nonterminal(nt_name)
            self._nonterminals[kernel] = nt
            self._nt_contexts[nt] = contexts

    # Get a set of nonterminals with unique contexts: nonterminals with
    # identical context sets collapse onto a single representative.
    self._log("Removing equivalent nonterminals...")
    context_nts = {con: nt for nt, con in self._nt_contexts.items()}
    self._log(
        "{} nonterminals removed".format(len(kernels) - len(context_nts)))
    self._log("{} new nonterminals constructed".format(
        len(context_nts) - num_nts))

    # Construct the rules from scratch each step.
    self._log("Constructing rules...")
    self._productions = set()
    timer = Timer()

    # Lexical rules: nt -> t whenever nt's contexts are a subset of the
    # contexts of the single-terminal kernel {t}.
    timer.start()
    for t in self._terminals:
        t_kernel = SentenceSet([Sentence([t])])
        t_nt = self._nonterminals[t_kernel]
        t_contexts = self._nt_contexts[t_nt]
        for contexts, nt in context_nts.items():
            rule = Production(nt, [t])
            if rule in self._productions:
                continue
            if rule in self._eliminated_rules:
                continue
            if contexts.issubset(t_contexts):
                self._productions.add(rule)
            else:
                self._eliminated_rules.add(rule)
    timer.stop()
    num_lex = len(self._productions)
    self._log("{} lexical rules ({:.2f} secs)".format(
        num_lex, timer.elapsed()))

    # Binary rules: nt -> nt_l nt_r, justified by the contexts of the
    # concatenated kernel kernel_l + kernel_r.
    timer.reset()
    timer.start()
    for kernel_l in self._nonterminals:
        for kernel_r in self._nonterminals:
            kernel_rhs = kernel_l + kernel_r
            sents_rhs = list(kernel_rhs.intersection(self._substrings))
            # Chunk the known substrings into kernels of size <= k.
            # // is required: `/` yields a float on Python 3 and range()
            # would raise TypeError.
            inds = range(len(sents_rhs) // self._k + 1)
            kers_rhs = [
                sents_rhs[self._k * i:self._k * (i + 1)] for i in inds
            ]
            kers_rhs = [SentenceSet(k) for k in kers_rhs if len(k) > 0]
            nts_rhs = [self._nonterminals[k] for k in kers_rhs]
            contexts_nts_rhs = [self._nt_contexts[nt] for nt in nts_rhs]
            if len(contexts_nts_rhs) > 0:
                contexts_rhs = contexts_nts_rhs[0].intersection(
                    *contexts_nts_rhs)
            else:
                contexts_rhs = self._contexts

            # Membership queries for RHS strings not seen as substrings.
            new_strs_rhs = kernel_rhs.difference(SentenceSet(sents_rhs))
            new_contexts_rhs = self._oracle.restr_right_triangle(
                new_strs_rhs, contexts_rhs)
            contexts_rhs.intersection_update(new_contexts_rhs)

            # Building the rules. nt_l/nt_r depend only on kernel_l and
            # kernel_r, so hoist them out of the inner loop.
            nt_l = context_nts[self._nt_contexts[
                self._nonterminals[kernel_l]]]
            nt_r = context_nts[self._nt_contexts[
                self._nonterminals[kernel_r]]]
            for contexts, nt in context_nts.items():
                rule = Production(nt, [nt_l, nt_r])
                if rule in self._productions:
                    continue
                if rule in self._eliminated_rules:
                    continue
                if contexts.issubset(contexts_rhs):
                    self._productions.add(rule)
                else:
                    self._eliminated_rules.add(rule)
    timer.stop()
    num_bin = len(self._productions) - num_lex
    self._log("{} binary rules ({:.2f} secs)".format(
        num_bin, timer.elapsed()))

    # Start rules: S -> nt for every nonterminal whose contexts include
    # the empty context (i.e. it can span a whole sentence).
    timer.reset()
    timer.start()
    for contexts, nt in context_nts.items():
        rule = Production(self._start_symbol, [nt])
        if rule in self._productions:
            continue
        if rule in self._eliminated_rules:
            continue
        if Context([], []) in contexts:
            self._productions.add(rule)
        else:
            self._eliminated_rules.add(rule)
    timer.stop()
    num_start = len(self._productions) - num_lex - num_bin
    self._log("{} start rules ({:.2f} secs)".format(
        num_start, timer.elapsed()))

    # Construct the grammar and its parser for the next iteration.
    self._curr_guess = CFG(self._start_symbol, self._productions)
    self._curr_guess_parser = ChartParser(self._curr_guess)

    total_timer.stop()
    elapsed = total_timer.elapsed()
    num_rules = len(self._curr_guess.productions())
    self._log("Constructed grammar with {} rules ({:.2f} secs)".format(
        num_rules, elapsed))
    return self._curr_guess