Example 1
# Assumes the NLTK 2.x API (NLTK 3 renamed parse_cfg to CFG.fromstring and replaced nbest_parse with parse).
from nltk import ChartParser, parse_cfg


def test(grammarText, sentences):
	"""Test the coverage of a CFG grammar.
	
	grammarText -- the grammar string
	sentences -- a list of sentences to test, with invalid ones prefixed with '*'
	
	"""	
	
	valid_sentences = [s for s in sentences if s[0] != '*']
	invalid_sentences = [s[1:] for s in sentences if s[0] == '*']
	parser = ChartParser(parse_cfg(grammarText))

	for sentence in valid_sentences:
		parses = parser.nbest_parse(sentence.split())
		print(sentence + "\n" + "\n".join(map(str, parses)) + "\n")
		assert parses, "Valid sentence failed to parse."

	for sentence in invalid_sentences:
		print("*" + sentence)
		parses = parser.nbest_parse(sentence.split())
		assert parses == [], "Invalid sentence parsed successfully."
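A hypothetical driver for this function (toy grammar and sentences invented for illustration; parse_cfg and nbest_parse are the NLTK 2.x names) might look like:

toy_grammar = """
S -> NP VP
NP -> 'dogs' | 'cats'
VP -> V NP
V -> 'chase'
"""
test(toy_grammar, ["dogs chase cats", "*chase dogs"])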
Example 2
def parse(sentence, cfg):
    # Older NLTK chart-parser API: parse_cfg builds a grammar from a string,
    # TD_STRATEGY selects the top-down chart-parsing strategy, and
    # get_parse_list returns every parse tree for the token sequence.
    grammar = parse_cfg(cfg)
    parser = ChartParser(grammar, TD_STRATEGY)
    words = sentence.split()
    # Draw all parse trees found for the sentence in an NLTK tree window.
    draw.draw_trees(*parser.get_parse_list(words))
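A hypothetical call with a toy grammar (assuming the older-NLTK names parse_cfg, TD_STRATEGY, get_parse_list, and draw used above are importable in the session) might be:

toy_cfg = """
S -> NP VP
NP -> 'John' | 'Mary'
VP -> V NP
V -> 'sees'
"""
parse("John sees Mary", toy_cfg)  # opens a window showing each parse tree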
Example 3
    def guess(self, verbose=None):
        """
        Makes a guess based on the next observation.
        Updates self._curr_guess.

        :rtype: CFG
        :returns: The next guess
        """
        if verbose is not None:
            self._verbose = verbose

        sentence = Sentence(next(self._text))
        self._num_steps += 1
        self._log("String {}: {}".format(self._num_steps, sentence))

        if sentence in self._data:
            self._log("String already seen")
            return self._curr_guess

        # Info from previous guess
        num_contexts = len(self._contexts)
        num_subs = len(self._substrings)
        if self._curr_guess is not None:
            num_nts = len(set(p.lhs()
                              for p in self._curr_guess.productions())) - 1
        else:
            num_nts = 0

        total_timer = Timer()
        total_timer.start()

        # Update data and terminals
        words = sentence.get_words()
        self._data.add(sentence)
        self._terminals.update(set(words))

        # Update contexts
        self._log("Updating contexts...")
        inds = range(0, len(words) + 1)
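        # Each context is the (prefix, suffix) pair left when a span words[i:j] is removed.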
        contexts = [
            Context(words[:i], words[j:]) for i in inds for j in inds[i:]
        ]
        self._contexts.update(ContextSet(contexts))
        self._log(
            "{} new contexts added".format(len(self._contexts) - num_contexts))

        # Update substrings
        self._log("Updating substrings...")

        is_new_sentence = True
        if self._curr_guess_parser is not None:
            try:
                parses = self._curr_guess_parser.parse(words)
                is_new_sentence = len(list(parses)) == 0
            except Exception:
                is_new_sentence = True

        if is_new_sentence:
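            # Record every contiguous substring words[i:j] of the new sentence.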
            subs = [Sentence(words[i:j]) for i in inds for j in inds[i:]]
            self._substrings.update(SentenceSet(subs))
            self._log("{} new substrings added".format(
                len(self._substrings) - num_subs))
        else:
            self._log("Sentence already generated by current guess")

        # Construct the nonterminals
        self._log("Constructing nonterminals...")

        kernels = set()
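        # Candidate kernels: every non-empty set of at most k observed substrings.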
        for i in range(1, self._k + 1):
            subsets = [
                SentenceSet(j) for j in combinations(self._substrings, i)
            ]
            kernels.update(subsets)

        for kernel in kernels:
            if kernel not in self._nonterminals:
                nt_name = self._new_name()
                contexts = self._oracle.restr_right_triangle(
                    kernel, self._contexts)
                nt = Nonterminal(nt_name)
                self._nonterminals[kernel] = nt
                self._nt_contexts[nt] = contexts

        # Get a set of nonterminals with unique contexts
        self._log("Removing equivalent nonterminals...")
        context_nts = {con: nt for nt, con in self._nt_contexts.items()}
        self._log(
            "{} nonterminals removed".format(len(kernels) - len(context_nts)))
        self._log("{} new nonterminals constructed".format(
            len(context_nts) - num_nts))

        # Construct the rules
        self._log("Constructing rules...")
        self._productions = set()
        timer = Timer()

        # Lexical rules
        timer.start()
        for t in self._terminals:
            t_kernel = SentenceSet([Sentence([t])])
            t_nt = self._nonterminals[t_kernel]
            t_contexts = self._nt_contexts[t_nt]

            # Keep nt -> t only when nt's context set is a subset of the
            # contexts recorded for the terminal's singleton kernel.
            for contexts, nt in context_nts.items():
                rule = Production(nt, [t])
                if rule in self._productions:
                    continue
                if rule in self._eliminated_rules:
                    continue

                if contexts.issubset(t_contexts):
                    self._productions.add(rule)
                else:
                    self._eliminated_rules.add(rule)

        timer.stop()
        num_lex = len(self._productions)
        self._log("{} lexical rules ({:.2f} secs)".format(
            num_lex, timer.elapsed()))

        # Binary rules
        timer.reset()
        timer.start()
        for kernel_l in self._nonterminals:
            for kernel_r in self._nonterminals:
                kernel_rhs = kernel_l + kernel_r
                sents_rhs = list(kernel_rhs.intersection(self._substrings))

                # Split the realized right-hand-side strings into kernels of at most k substrings.
                inds = range(len(sents_rhs) // self._k + 1)
                kers_rhs = [
                    sents_rhs[self._k * i:self._k * (i + 1)] for i in inds
                ]
                kers_rhs = [SentenceSet(k) for k in kers_rhs if len(k) > 0]

                nts_rhs = [self._nonterminals[k] for k in kers_rhs]
                contexts_nts_rhs = [self._nt_contexts[nt] for nt in nts_rhs]
                if len(contexts_nts_rhs) > 0:
                    contexts_rhs = contexts_nts_rhs[0].intersection(
                        *contexts_nts_rhs)
                else:
                    contexts_rhs = self._contexts

                # Membership queries
                new_strs_rhs = kernel_rhs.difference(SentenceSet(sents_rhs))
                new_contexts_rhs = self._oracle.restr_right_triangle(
                    new_strs_rhs, contexts_rhs)
                contexts_rhs.intersection_update(new_contexts_rhs)

                # Building the rules
                for contexts, nt in context_nts.items():
                    nt_l = context_nts[self._nt_contexts[
                        self._nonterminals[kernel_l]]]
                    nt_r = context_nts[self._nt_contexts[
                        self._nonterminals[kernel_r]]]
                    rule = Production(nt, [nt_l, nt_r])
                    if rule in self._productions:
                        continue
                    if rule in self._eliminated_rules:
                        continue

                    if contexts.issubset(contexts_rhs):
                        self._productions.add(rule)
                    else:
                        self._eliminated_rules.add(rule)

        timer.stop()
        num_bin = len(self._productions) - num_lex
        self._log("{} binary rules ({:.2f} secs)".format(
            num_bin, timer.elapsed()))

        # Start rules
        timer.reset()
        timer.start()
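        # A start rule S -> nt is kept only if nt's contexts include the empty
        # context, i.e. nt can span a whole sentence on its own.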
        for contexts, nt in context_nts.items():
            rule = Production(self._start_symbol, [nt])
            if rule in self._productions:
                continue
            if rule in self._eliminated_rules:
                continue
            if Context([], []) in contexts:
                self._productions.add(rule)
            else:
                self._eliminated_rules.add(rule)

        timer.stop()
        num_start = len(self._productions) - num_lex - num_bin
        self._log("{} start rules ({:.2f} secs)".format(
            num_start, timer.elapsed()))

        # Construct the grammar
        self._curr_guess = CFG(self._start_symbol, self._productions)
        self._curr_guess_parser = ChartParser(self._curr_guess)

        total_timer.stop()
        elapsed = total_timer.elapsed()
        num_rules = len(self._curr_guess.productions())
        self._log("Constructed grammar with {} rules ({:.2f} secs)".format(
            num_rules, elapsed))

        return self._curr_guess
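To make the context/substring bookkeeping in guess() concrete, here is a small standalone sketch; plain tuples stand in for the Sentence/Context classes, which are not part of the excerpt:

def enumerate_splits(words):
    """Mirror the splits used in guess(): a context is the (prefix, suffix)
    pair left when a contiguous span words[i:j] is removed; a substring is
    the span itself."""
    n = len(words)
    contexts = {(tuple(words[:i]), tuple(words[j:]))
                for i in range(n + 1) for j in range(i, n + 1)}
    substrings = {tuple(words[i:j])
                  for i in range(n + 1) for j in range(i, n + 1)}
    return contexts, substrings

contexts, substrings = enumerate_splits("the dog barks".split())
print("{} contexts, {} substrings".format(len(contexts), len(substrings)))
# -> 10 contexts, 7 substrings (the empty span collapses to a single substring)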