Example #1
0
    def test_horz_markov_0(self):
        """Check the productions induced from a ternary NP with horzMarkov=0.

        The expected grammar binarizes the NP through a helper node named
        'NP|<>' (no sibling labels in its name) and rewrites every POS tag
        to itself as a terminal, each with probability 1.
        """
        tree = Tree.fromstring("(NP (Det el) (Noun gato) (Adj negro))")
        grammar = UPCFG([tree], horzMarkov=0)
        actual = grammar.productions()

        # Right-binarized structure of the flat three-child NP.
        expected = [
            ProbabilisticProduction(N('NP'), [N('Det'), N('NP|<>')], prob=1.0),
            ProbabilisticProduction(N('NP|<>'),
                                    [N('Noun'), N('Adj')],
                                    prob=1.0),
        ]
        # Unlexicalized preterminals: each tag yields its own name.
        expected += [
            ProbabilisticProduction(N(tag), [tag], prob=1.0)
            for tag in ('Det', 'Noun', 'Adj')
        ]

        self.assertEqual(set(actual), set(expected))
Example #2
0
    def test_horz_markov_None(self):
        """Check the productions induced from a ternary NP with no horzMarkov.

        horzMarkov is left at its default here, and the expected helper node
        carries the remaining sibling labels in its name: 'NP|<Noun-Adj>'.
        """
        tree = Tree.fromstring("(NP (Det el) (Noun gato) (Adj negro))")

        # Differs from the official test: start='NP' is passed explicitly
        # (noted as a bugfix by the original author).
        grammar = UPCFG([tree], start='NP')  # horzMarkov=None by default
        actual = grammar.productions()

        helper = 'NP|<Noun-Adj>'
        # Right-binarized structure of the flat three-child NP.
        expected = [
            ProbabilisticProduction(N('NP'),
                                    [N('Det'), N(helper)],
                                    prob=1.0),
            ProbabilisticProduction(N(helper),
                                    [N('Noun'), N('Adj')],
                                    prob=1.0),
        ]
        # Unlexicalized preterminals: each tag yields its own name.
        expected += [
            ProbabilisticProduction(N(tag), [tag], prob=1.0)
            for tag in ('Det', 'Noun', 'Adj')
        ]

        self.assertEqual(set(actual), set(expected))
Example #3
0
    def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
        """
        Induce a PCFG from unlexicalized training trees and build a CKY
        parser over it.

        parsed_sents -- list of training trees.
        start -- label of the start symbol. It is also written onto the root
                 of every (copied) training tree, so the trees' own root
                 labels are not used.
        horzMarkov -- forwarded unchanged to chomsky_normal_form.
        """
        # The start non-terminal comes from the 'start' argument, not from
        # the root labels found in parsed_sents.
        self.start = N(start)
        self.horzMarkov = horzMarkov

        # Duplicates are kept on purpose: the repeated productions are what
        # the probabilities are induced from.
        observed = []
        for tree in parsed_sents:
            # All transformations run on a deep copy of the input tree.
            work = unlexicalize(tree.copy(deep=True))
            work.set_label(start)
            work.chomsky_normal_form(horzMarkov=horzMarkov)
            # Both flags are enabled: collapsePOS=True and collapseRoot=True.
            work.collapse_unary(collapsePOS=True, collapseRoot=True)
            observed.extend(work.productions())

        grammar = induce_pcfg(self.start, observed)
        self.pcfg = grammar
        self._probabilistic_productions = grammar.productions()
        self._parser = CKYParser(grammar)
Example #4
0
    def test_productions(self):
        """Check the productions induced from one already-binary sentence.

        NP is expanded in two different ways in the tree, so each NP
        production is expected with probability 0.5; every other production
        is expected with probability 1.
        """
        tree = Tree.fromstring("""
                (S
                    (NP (Det el) (Noun gato))
                    (VP (Verb come) (NP (Noun pescado) (Adj crudo)))
                )
            """)

        # Differs from the official test: start='S' is passed explicitly
        # (noted as a bugfix by the original author).
        grammar = UPCFG([tree], start='S')
        actual = grammar.productions()

        def rule(lhs, rhs, prob=1.0):
            # Shorthand for one expected probabilistic production.
            return ProbabilisticProduction(N(lhs), rhs, prob=prob)

        expected = [
            # phrase-level rules
            rule('S', [N('NP'), N('VP')]),
            rule('VP', [N('Verb'), N('NP')]),
            rule('NP', [N('Det'), N('Noun')], prob=0.5),
            rule('NP', [N('Noun'), N('Adj')], prob=0.5),
            # unlexicalized preterminals
            rule('Det', ['Det']),
            rule('Noun', ['Noun']),
            rule('Verb', ['Verb']),
            rule('Adj', ['Adj']),
        ]

        self.assertEqual(set(actual), set(expected))