Example #1
0
    def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
        """
        Estimate an unlexicalized PCFG from a treebank and build its parser.

        parsed_sents -- list of training trees.
        start -- start symbol.
        horzMarkov -- None for default. A number n >= 0 for horizontal markov.
        """
        self.start = start

        # rhs_counts[X][rhs] = count(X -> rhs); lhs_totals[X] = count(X).
        rhs_counts = defaultdict(lambda: defaultdict(int))
        lhs_totals = defaultdict(int)
        for sent_tree in parsed_sents:
            # Trees are mutable: transform a deep copy, never the original.
            tree = unlexicalize(sent_tree.copy(deep=True))
            # Binarize (with optional horizontal markovization), then
            # collapse the subtrees that have a single child.
            tree.chomsky_normal_form(horzMarkov=horzMarkov)
            tree.collapse_unary(collapsePOS=True)
            for rule in tree.productions():
                head = rule.lhs()
                rhs_counts[head][rule.rhs()] += 1
                lhs_totals[head] += 1

        # Relative-frequency estimate: q(X -> rhs) = count(X -> rhs) / count(X)
        self.production = []
        for head, by_rhs in rhs_counts.items():
            total = float(lhs_totals[head])
            self.production.extend(
                ProbabilisticProduction(head, rhs, prob=count / total)
                for rhs, count in by_rhs.items())

        self.parser = CKYParser(PCFG(Nonterminal(start), self.production))
Example #2
0
    def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
        """
        Train an unlexicalized PCFG and build a CKY parser for it.

        parsed_sents -- list of training trees.
        start -- start symbol used for the grammar of the CKY parser.
        horzMarkov -- forwarded to chomsky_normal_form for every tree.
        """
        self.start = start

        # unlexicalize modifies the tree it is given, so hand it deep copies
        # instead of the trees in parsed_sents.
        trees = [unlexicalize(tree.copy(deep=True)) for tree in parsed_sents]

        rule_counts = defaultdict(int)  # {A -> B: count(A -> B)}
        head_counts = defaultdict(int)  # {A: count(A)}
        for tree in trees:
            tree.chomsky_normal_form(horzMarkov=horzMarkov)
            tree.collapse_unary(collapsePOS=True, collapseRoot=True)
            # Each item is an nltk Production: lhs() is a Nonterminal and
            # rhs() is a tuple.
            for rule in tree.productions():
                rule_counts[rule] += 1
                head_counts[rule.lhs()] += 1

        # Maximum-likelihood estimate: q(A -> B) = count(A -> B) / count(A).
        self.prods = []
        for rule, count in rule_counts.items():
            head = rule.lhs()
            likelihood = float(count) / head_counts[head]
            self.prods.append(
                ProbabilisticProduction(head, rule.rhs(), prob=likelihood))

        # PCFG takes a Nonterminal start symbol and the production list.
        self.my_parser = CKYParser(PCFG(Nonterminal(start), self.prods))
Example #3
0
class UPCFG:
    """Unlexicalized PCFG.

    The grammar is induced from the productions of the unlexicalized
    training trees and is used by a CKY parser over POS tag sequences.
    """

    def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
        """
        parsed_sents -- list of training trees.
        start -- start symbol; every training tree root is relabeled to it.
        horzMarkov -- passed to chomsky_normal_form for every tree.
        """
        # Non-terminal start symbol of the pcfg.
        # Be aware that the start symbol is given by the 'start' parameter,
        # not by the root label of the trees in parsed_sents.
        self.start = N(start)
        self.horzMarkov = horzMarkov
        # Productions are kept with repetitions: induce_pcfg needs the
        # occurrence counts to estimate the probabilities.
        productions = []
        for t in parsed_sents:
            # Deep copy so the training trees themselves are not modified.
            unlex_t = unlexicalize(t.copy(deep=True))
            # Relabel the root with the requested start symbol.
            unlex_t.set_label(start)
            unlex_t.chomsky_normal_form(horzMarkov=horzMarkov)
            # Collapse unary chains, including the ones at the root and the
            # ones directly above POS tags.
            unlex_t.collapse_unary(collapsePOS=True, collapseRoot=True)
            productions.extend(unlex_t.productions())

        self.pcfg = induce_pcfg(self.start, productions)
        self._probabilistic_productions = self.pcfg.productions()
        self._parser = CKYParser(self.pcfg)

    def productions(self):
        """Returns the list of UPCFG probabilistic productions.
        """
        # type: list(nltk.grammar.ProbabilisticProduction)
        return self._probabilistic_productions

    def parse(self, tagged_sent):
        """Parse a tagged sentence.

        tagged_sent -- the tagged sentence (a list of pairs (word, tag)).

        Returns the parse tree with the words as leaves. When the parser
        finds no tree (or the sentence is empty) a flat tree is returned:
        the start symbol directly over one (tag word) subtree per token.
        """
        if not tagged_sent:
            # zip(*[]) yields nothing, so the unpacking below would raise
            # ValueError; an empty sentence gets the empty flat tree.
            return Tree(self.start.symbol(), [])

        words, tags = zip(*tagged_sent)
        # Unlexicalized tree in CNF
        _, unlex_parse_tree = self._parser.parse(tags)

        if unlex_parse_tree is None:
            # Flat tree
            return Tree(self.start.symbol(),
                        [Tree(tag, [word]) for word, tag in tagged_sent])

        # Undo CNF
        unlex_parse_tree.un_chomsky_normal_form()
        # Add words
        return lexicalize(unlex_parse_tree, words)
Example #4
0
    def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
        """
        Induce the PCFG from the training trees and build its CKY parser.

        parsed_sents -- list of training trees.
        start -- start symbol; every training tree root is relabeled to it.
        horzMarkov -- passed to chomsky_normal_form for every tree.
        """
        # The start symbol is taken from the 'start' parameter, not from the
        # root label of the trees in parsed_sents.
        self.start = N(start)
        self.horzMarkov = horzMarkov

        # Repeated productions are kept on purpose: induce_pcfg uses how
        # often each one occurs to estimate its probability.
        observed = []
        for original in parsed_sents:
            tree = unlexicalize(original.copy(deep=True))
            tree.set_label(start)
            tree.chomsky_normal_form(horzMarkov=horzMarkov)
            # Unary chains are collapsed at the root and above POS tags too.
            tree.collapse_unary(collapsePOS=True, collapseRoot=True)
            observed.extend(tree.productions())

        grammar = induce_pcfg(self.start, observed)
        self.pcfg = grammar
        self._probabilistic_productions = grammar.productions()
        self._parser = CKYParser(grammar)
Example #5
0
    def test_parse(self):
        # Unambiguous five-word sentence: check the CKY chart, the stored
        # subtrees, the returned tree and its log probability.
        grammar = PCFG.fromstring(
            """
                S -> NP VP              [1.0]
                NP -> Det Noun          [0.6]
                NP -> Noun Adj          [0.4]
                VP -> Verb NP           [1.0]
                Det -> 'el'             [1.0]
                Noun -> 'gato'          [0.9]
                Noun -> 'pescado'       [0.1]
                Verb -> 'come'          [1.0]
                Adj -> 'crudo'          [1.0]
            """)

        parser = CKYParser(grammar)
        sentence = 'el gato come pescado crudo'.split()
        lp, t = parser.parse(sentence)

        # All chart cells (i, j) with 1 <= i <= j <= 5, shortest spans first.
        # Cells not filled in below are expected to be empty.
        size = 5
        spans = [(i, i + width)
                 for width in range(size)
                 for i in range(1, size - width + 1)]

        # check chart
        lp_np_left = log2(0.6 * 1.0 * 0.9)
        lp_np_right = log2(0.4 * 0.1 * 1.0)
        lp_vp = log2(1.0) + log2(1.0) + lp_np_right
        pi = {span: {} for span in spans}
        pi.update({
            (1, 1): {'Det': log2(1.0)},
            (2, 2): {'Noun': log2(0.9)},
            (3, 3): {'Verb': log2(1.0)},
            (4, 4): {'Noun': log2(0.1)},
            (5, 5): {'Adj': log2(1.0)},
            (1, 2): {'NP': lp_np_left},
            (4, 5): {'NP': lp_np_right},
            (3, 5): {'VP': lp_vp},
            # rule S -> NP VP, then left part, then right part
            (1, 5): {'S': log2(1.0) + lp_np_left + lp_vp},
        })
        self.assertEqualPi(parser._pi, pi)

        # check partial results
        whole_sentence = Tree.fromstring(
            """(S
                    (NP (Det el) (Noun gato))
                    (VP (Verb come) (NP (Noun pescado) (Adj crudo)))
                   )
                """)
        bp = {span: {} for span in spans}
        bp.update({
            (1, 1): {'Det': Tree.fromstring("(Det el)")},
            (2, 2): {'Noun': Tree.fromstring("(Noun gato)")},
            (3, 3): {'Verb': Tree.fromstring("(Verb come)")},
            (4, 4): {'Noun': Tree.fromstring("(Noun pescado)")},
            (5, 5): {'Adj': Tree.fromstring("(Adj crudo)")},
            (1, 2): {'NP': Tree.fromstring("(NP (Det el) (Noun gato))")},
            (4, 5): {'NP': Tree.fromstring("(NP (Noun pescado) (Adj crudo))")},
            (3, 5): {'VP': Tree.fromstring(
                "(VP (Verb come) (NP (Noun pescado) (Adj crudo)))")},
            (1, 5): {'S': whole_sentence},
        })
        self.assertEqual(parser._bp, bp)

        # check tree
        expected_tree = Tree.fromstring(
            """
                (S
                    (NP (Det el) (Noun gato))
                    (VP (Verb come) (NP (Noun pescado) (Adj crudo)))
                )
            """)
        self.assertEqual(t, expected_tree)

        # check log probability
        expected_lp = log2(1.0 * 0.6 * 1.0 * 0.9 * 1.0 * 1.0 * 0.4 * 0.1 * 1.0)
        self.assertAlmostEqual(lp, expected_lp)
Example #6
0
    def test_parse_2(self):
        # Eight-word sentence with a PP-attachment ambiguity: check the whole
        # CKY chart and the best subtree stored for every span.
        grammar = PCFG.fromstring(
            """
                S -> NP VP              [1.0]
                NP -> NP PP             [0.5]
                NP -> Det Noun          [0.5]
                VP -> VP PP             [0.9]
                VP -> Verb NP           [0.1]
                PP -> Prep NP           [1.0]
                Noun -> 'dog'           [0.2]
                Noun -> 'man'           [0.2]
                Noun -> 'town'          [0.6]
                Verb -> 'saw'           [1.0]
                Prep -> 'in'            [1.0]
                Det -> 'the'            [1.0]
            """)

        parser = CKYParser(grammar)
        sentence = 'the man saw the dog in the town'.split()
        lp, t = parser.parse(sentence)

        # All chart cells (i, j) with 1 <= i <= j <= 8, shortest spans first.
        # Cells not filled in below are expected to be empty.
        size = 8
        spans = [(i, i + width)
                 for width in range(size)
                 for i in range(1, size - width + 1)]

        # check chart
        pi = {span: {} for span in spans}
        pi.update({
            (1, 1): {'Det': log2(1.0)},
            (2, 2): {'Noun': log2(0.2)},
            (3, 3): {'Verb': log2(1.0)},
            (4, 4): {'Det': log2(1.0)},
            (5, 5): {'Noun': log2(0.2)},
            (6, 6): {'Prep': log2(1.0)},
            (7, 7): {'Det': log2(1.0)},
            (8, 8): {'Noun': log2(0.6)},
            (1, 2): {'NP': -3.321928094887362},
            (4, 5): {'NP': -3.321928094887362},
            (7, 8): {'NP': -1.736965594166206},
            (3, 5): {'VP': -6.643856189774724},
            (6, 8): {'PP': -1.736965594166206},
            (1, 5): {'S': -9.965784284662087},
            (4, 8): {'NP': -6.058893689053567},
            (3, 8): {'VP': -8.53282487738598},
            (1, 8): {'S': -11.854752972273342},
        })
        self.assertEqualPi(parser._pi, pi)

        # check partial results
        bp = {span: {} for span in spans}
        bp.update({
            (1, 1): {'Det': Tree.fromstring('(Det the)')},
            (2, 2): {'Noun': Tree.fromstring('(Noun man)')},
            (3, 3): {'Verb': Tree.fromstring('(Verb saw)')},
            (4, 4): {'Det': Tree.fromstring('(Det the)')},
            (5, 5): {'Noun': Tree.fromstring('(Noun dog)')},
            (6, 6): {'Prep': Tree.fromstring('(Prep in)')},
            (7, 7): {'Det': Tree.fromstring('(Det the)')},
            (8, 8): {'Noun': Tree.fromstring('(Noun town)')},
            (1, 2): {'NP': Tree.fromstring('(NP (Det the) (Noun man))')},
            (4, 5): {'NP': Tree.fromstring('(NP (Det the) (Noun dog))')},
            (7, 8): {'NP': Tree.fromstring('(NP (Det the) (Noun town))')},
            (3, 5): {'VP': Tree.fromstring(
                '(VP (Verb saw) (NP (Det the) (Noun dog)))')},
            (6, 8): {'PP': Tree.fromstring(
                '(PP (Prep in) (NP (Det the) (Noun town)))')},
            (1, 5): {'S': Tree.fromstring(
                """(S
                      (NP (Det the) (Noun man))
                      (VP (Verb saw) (NP (Det the) (Noun dog))))""")},
            (4, 8): {'NP': Tree.fromstring(
                """(NP
                      (NP (Det the) (Noun dog))
                      (PP (Prep in) (NP (Det the) (Noun town))))""")},
            (3, 8): {'VP': Tree.fromstring(
                """(VP
                      (VP (Verb saw) (NP (Det the) (Noun dog)))
                      (PP (Prep in) (NP (Det the) (Noun town))))""")},
            (1, 8): {'S': Tree.fromstring(
                """(S
                      (NP (Det the) (Noun man))
                      (VP
                        (VP (Verb saw) (NP (Det the) (Noun dog)))
                        (PP (Prep in) (NP (Det the) (Noun town)))))""")},
        })
        self.assertEqual(parser._bp, bp)
Example #7
0
    def test_parse(self):
        # Unambiguous five-word sentence: check the CKY chart, the stored
        # subtrees, the returned tree and its log probability.
        grammar = PCFG.fromstring("""
                S -> NP VP              [1.0]
                NP -> Det Noun          [0.6]
                NP -> Noun Adj          [0.4]
                VP -> Verb NP           [1.0]
                Det -> 'el'             [1.0]
                Noun -> 'gato'          [0.9]
                Noun -> 'pescado'       [0.1]
                Verb -> 'come'          [1.0]
                Adj -> 'crudo'          [1.0]
            """)

        parser = CKYParser(grammar)
        tokens = 'el gato come pescado crudo'.split()
        lp, t = parser.parse(tokens)

        # Chart cells (i, j) with 1 <= i <= j <= 5, ordered by span width.
        # Any cell that is not overwritten below is expected to be empty.
        length = 5
        cells = [(start, start + width)
                 for width in range(length)
                 for start in range(1, length - width + 1)]

        # check chart
        left_np = log2(0.6 * 1.0 * 0.9)
        right_np = log2(0.4 * 0.1 * 1.0)
        verb_phrase = log2(1.0) + log2(1.0) + right_np
        pi = dict.fromkeys(cells)
        for cell in cells:
            pi[cell] = {}
        pi[1, 1] = {'Det': log2(1.0)}
        pi[2, 2] = {'Noun': log2(0.9)}
        pi[3, 3] = {'Verb': log2(1.0)}
        pi[4, 4] = {'Noun': log2(0.1)}
        pi[5, 5] = {'Adj': log2(1.0)}
        pi[1, 2] = {'NP': left_np}
        pi[4, 5] = {'NP': right_np}
        pi[3, 5] = {'VP': verb_phrase}
        # rule S -> NP VP, then left part, then right part
        pi[1, 5] = {'S': log2(1.0) + left_np + verb_phrase}
        self.assertEqualPi(parser._pi, pi)

        # check partial results
        bp = dict.fromkeys(cells)
        for cell in cells:
            bp[cell] = {}
        bp[1, 1] = {'Det': Tree.fromstring("(Det el)")}
        bp[2, 2] = {'Noun': Tree.fromstring("(Noun gato)")}
        bp[3, 3] = {'Verb': Tree.fromstring("(Verb come)")}
        bp[4, 4] = {'Noun': Tree.fromstring("(Noun pescado)")}
        bp[5, 5] = {'Adj': Tree.fromstring("(Adj crudo)")}
        bp[1, 2] = {'NP': Tree.fromstring("(NP (Det el) (Noun gato))")}
        bp[4, 5] = {'NP': Tree.fromstring("(NP (Noun pescado) (Adj crudo))")}
        bp[3, 5] = {'VP': Tree.fromstring(
            "(VP (Verb come) (NP (Noun pescado) (Adj crudo)))")}
        bp[1, 5] = {'S': Tree.fromstring("""(S
                    (NP (Det el) (Noun gato))
                    (VP (Verb come) (NP (Noun pescado) (Adj crudo)))
                   )
                """)}
        self.assertEqual(parser._bp, bp)

        # check tree
        expected_tree = Tree.fromstring("""
                (S
                    (NP (Det el) (Noun gato))
                    (VP (Verb come) (NP (Noun pescado) (Adj crudo)))
                )
            """)
        self.assertEqual(t, expected_tree)

        # check log probability
        expected_lp = log2(1.0 * 0.6 * 1.0 * 0.9 * 1.0 * 1.0 * 0.4 * 0.1 * 1.0)
        self.assertAlmostEqual(lp, expected_lp)
Example #8
0
    def test_parse_2(self):
        # Eight-word sentence with a PP-attachment ambiguity: check the whole
        # CKY chart and the best subtree stored for every span.
        grammar = PCFG.fromstring("""
                S -> NP VP              [1.0]
                NP -> NP PP             [0.5]
                NP -> Det Noun          [0.5]
                VP -> VP PP             [0.9]
                VP -> Verb NP           [0.1]
                PP -> Prep NP           [1.0]
                Noun -> 'dog'           [0.2]
                Noun -> 'man'           [0.2]
                Noun -> 'town'          [0.6]
                Verb -> 'saw'           [1.0]
                Prep -> 'in'            [1.0]
                Det -> 'the'            [1.0]
            """)

        parser = CKYParser(grammar)
        tokens = 'the man saw the dog in the town'.split()
        lp, t = parser.parse(tokens)

        # Chart cells (i, j) with 1 <= i <= j <= 8, ordered by span width.
        # Any cell that is not overwritten below is expected to be empty.
        length = 8
        cells = [(start, start + width)
                 for width in range(length)
                 for start in range(1, length - width + 1)]

        # check chart
        pi = dict.fromkeys(cells)
        for cell in cells:
            pi[cell] = {}
        pi[1, 1] = {'Det': log2(1.0)}
        pi[2, 2] = {'Noun': log2(0.2)}
        pi[3, 3] = {'Verb': log2(1.0)}
        pi[4, 4] = {'Det': log2(1.0)}
        pi[5, 5] = {'Noun': log2(0.2)}
        pi[6, 6] = {'Prep': log2(1.0)}
        pi[7, 7] = {'Det': log2(1.0)}
        pi[8, 8] = {'Noun': log2(0.6)}
        pi[1, 2] = {'NP': -3.321928094887362}
        pi[4, 5] = {'NP': -3.321928094887362}
        pi[7, 8] = {'NP': -1.736965594166206}
        pi[3, 5] = {'VP': -6.643856189774724}
        pi[6, 8] = {'PP': -1.736965594166206}
        pi[1, 5] = {'S': -9.965784284662087}
        pi[4, 8] = {'NP': -6.058893689053567}
        pi[3, 8] = {'VP': -8.53282487738598}
        pi[1, 8] = {'S': -11.854752972273342}
        self.assertEqualPi(parser._pi, pi)

        # check partial results
        bp = dict.fromkeys(cells)
        for cell in cells:
            bp[cell] = {}
        bp[1, 1] = {'Det': Tree.fromstring('(Det the)')}
        bp[2, 2] = {'Noun': Tree.fromstring('(Noun man)')}
        bp[3, 3] = {'Verb': Tree.fromstring('(Verb saw)')}
        bp[4, 4] = {'Det': Tree.fromstring('(Det the)')}
        bp[5, 5] = {'Noun': Tree.fromstring('(Noun dog)')}
        bp[6, 6] = {'Prep': Tree.fromstring('(Prep in)')}
        bp[7, 7] = {'Det': Tree.fromstring('(Det the)')}
        bp[8, 8] = {'Noun': Tree.fromstring('(Noun town)')}
        bp[1, 2] = {'NP': Tree.fromstring('(NP (Det the) (Noun man))')}
        bp[4, 5] = {'NP': Tree.fromstring('(NP (Det the) (Noun dog))')}
        bp[7, 8] = {'NP': Tree.fromstring('(NP (Det the) (Noun town))')}
        bp[3, 5] = {'VP': Tree.fromstring(
            '(VP (Verb saw) (NP (Det the) (Noun dog)))')}
        bp[6, 8] = {'PP': Tree.fromstring(
            '(PP (Prep in) (NP (Det the) (Noun town)))')}
        bp[1, 5] = {'S': Tree.fromstring("""(S
                      (NP (Det the) (Noun man))
                      (VP (Verb saw) (NP (Det the) (Noun dog))))""")}
        bp[4, 8] = {'NP': Tree.fromstring("""(NP
                      (NP (Det the) (Noun dog))
                      (PP (Prep in) (NP (Det the) (Noun town))))""")}
        bp[3, 8] = {'VP': Tree.fromstring("""(VP
                      (VP (Verb saw) (NP (Det the) (Noun dog)))
                      (PP (Prep in) (NP (Det the) (Noun town))))""")}
        bp[1, 8] = {'S': Tree.fromstring("""(S
                      (NP (Det the) (Noun man))
                      (VP
                        (VP (Verb saw) (NP (Det the) (Noun dog)))
                        (PP (Prep in) (NP (Det the) (Noun town)))))""")}

        self.assertEqual(parser._bp, bp)
Example #9
0
class UPCFG:
    """
    Unlexicalized PCFG.
    """

    def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
        """
        Estimate the grammar from training trees and build its CKY parser.

        parsed_sents -- list of training trees.
        start -- start symbol (a string) of the grammar given to the parser.
        horzMarkov -- passed to chomsky_normal_form for every tree.
        """
        # { A -> B : count(A -> B) }
        productions_counts = defaultdict(int)
        # { A : count(A) }
        lhs_count = defaultdict(int)

        self.start = start  # also the root label of the flat fallback tree

        # unlexicalize modifies the tree it receives, so work on deep copies
        # and leave the trees in parsed_sents untouched.
        unlex_sents = [unlexicalize(t.copy(deep=True)) for t in parsed_sents]

        for t in unlex_sents:
            t.chomsky_normal_form(horzMarkov=horzMarkov)
            t.collapse_unary(collapsePOS=True, collapseRoot=True)
            # Each prod is an nltk Production: lhs() is a Nonterminal and
            # rhs() is a tuple.
            for prod in t.productions():
                productions_counts[prod] += 1
                lhs_count[prod.lhs()] += 1

        # Maximum-likelihood estimate: q(A -> B) = count(A -> B) / count(A).
        # Every counted production also incremented the count of its lhs, so
        # the denominator is always present and positive.
        self.prods = []
        for prod, count_prod in productions_counts.items():
            q_ml = float(count_prod) / lhs_count[prod.lhs()]
            self.prods.append(
                ProbabilisticProduction(prod.lhs(), prod.rhs(), prob=q_ml))

        # PCFG takes a Nonterminal start symbol and the production list.
        grammar = PCFG(Nonterminal(start), self.prods)
        self.my_parser = CKYParser(grammar)

    def productions(self):
        """
        Returns the list of UPCFG probabilistic productions.
        """
        return self.prods

    def parse(self, tagged_sent):
        """
        Parse a tagged sentence.

        tagged_sent -- the tagged sentence (a list of pairs (word, tag)).

        Returns the parse tree with the words as leaves. When CKY cannot
        parse the tags (or the sentence is empty) a flat tree is returned:
        the start symbol directly over one (tag word) subtree per token.
        """
        if not tagged_sent:
            # zip(*[]) yields nothing, so the unpacking below would raise
            # ValueError; an empty sentence gets the empty flat tree.
            return Tree(self.start, [])

        words, tags = zip(*tagged_sent)

        log_probability, tree = self.my_parser.parse(tags)

        # If CKY could not parse the sentence, fall back to the flat tree.
        if log_probability == float("-inf"):
            return Tree(self.start, [Tree(t, [w]) for w, t in tagged_sent])

        # The CKY tree is in Chomsky normal form: undo it, then put the
        # words back.
        tree.un_chomsky_normal_form()

        return lexicalize(tree, words)
Example #10
0
    def test_parse_ambiguity(self):
        # Example taken from pages 4, 5 and 8 of Michael Collins' notes
        # "Probabilistic Context-Free Grammars (PCFGs)".
        #
        # Terminals must be quoted: PCFG.fromstring reads an unquoted
        # right-hand-side symbol as a Nonterminal, not as a word.
        grammar = PCFG.fromstring("""
                S -> NP VP              [1.0]

                VP -> Vt NP             [0.65]
                VP -> VP PP             [0.35]

                NP -> DT NN             [0.8]
                NP -> NP PP             [0.2]

                PP -> IN NP             [1.0]

                Vt -> 'saw'             [1.0]

                NN -> 'man'             [0.2]
                NN -> 'telescope'       [0.3]
                NN -> 'dog'             [0.5]

                DT -> 'the'             [1.0]

                IN -> 'with'            [1.0]
            """)

        # With these probabilities instead:
        #     VP -> Vt NP             [0.85]
        #     VP -> VP PP             [0.15]
        # the other tree (PP attached to the object NP) is obtained.

        parser = CKYParser(grammar)

        lp, t = parser.parse('the man saw the dog with the telescope'.split())

        # check tree: the PP attaches to the VP
        t2 = Tree.fromstring("""
                    (S
                        (NP
                            (DT the)
                            (NN man)
                        )
                        (VP
                            (VP
                                (Vt saw)
                                (NP
                                    (DT the)
                                    (NN dog)
                                )
                            )
                            (PP
                                (IN with)
                                (NP
                                    (DT the)
                                    (NN telescope)
                                )
                            )
                        )
                    )
                """)

        self.assertEqual(t, t2)

        # check log probability: product of the 15 rules used in the tree
        lp2 = log2(1.0 * 0.8 * 1.0 * 0.2 * 0.35 * 0.65 * 1.0 * 0.8 * 1.0 *
                   0.5 * 1.0 * 1.0 * 0.8 * 1.0 * 0.3)

        self.assertAlmostEqual(lp, lp2)
Example #11
0
    def test_ambiguo(self):
        # Ambiguous sentence (the PP can attach to the VP or to the object
        # NP): check the CKY chart and that the VP attachment is returned.
        grammar = PCFG.fromstring("""
                S -> NP VP              [1.0]
                VP -> Vt NP             [0.3]
                VP -> VP PP             [0.7]
                NP -> NP PP             [0.6]
                NP -> DT NN             [0.4]
                PP -> IN NP             [1.0]
                Vt -> 'saw'             [1.0]
                NN -> 'man'             [0.33]
                NN -> 'telescope'       [0.33]
                NN -> 'dog'             [0.34]
                DT -> 'the'             [1.0]
                IN -> 'with'            [1.0]
            """)

        parser = CKYParser(grammar)
        sentence = 'the man saw the dog with the telescope'.split()
        lp, t = parser.parse(sentence)

        # All chart cells (i, j) with 1 <= i <= j <= 8, shortest spans first.
        # Cells not filled in below are expected to be empty.
        size = 8
        spans = [(i, i + width)
                 for width in range(size)
                 for i in range(1, size - width + 1)]

        pi = {span: {} for span in spans}
        pi.update({
            (1, 1): {'DT': 0.0},
            (2, 2): {'NN': -1.5994620704162712},
            (3, 3): {'Vt': 0.0},
            (4, 4): {'DT': 0.0},
            (5, 5): {'NN': -1.5563933485243853},
            (6, 6): {'IN': 0.0},
            (7, 7): {'DT': 0.0},
            (8, 8): {'NN': -1.5994620704162712},
            (1, 2): {'NP': -2.9213901653036336},
            (4, 5): {'NP': -2.8783214434117474},
            (7, 8): {'NP': -2.9213901653036336},
            (3, 5): {'VP': -4.6152870375779536},
            (6, 8): {'PP': -2.9213901653036336},
            (1, 5): {'S': -7.536677202881587},
            (4, 8): {'NP': -6.536677202881587},
            (3, 8): {'VP': -8.051250375711346},
            (1, 8): {'S': -10.972640541014979},
        })

        self.assertEqualPi(parser._pi, pi)

        expected_tree = Tree.fromstring("""
            (S
              (NP (DT the) (NN man))
              (VP
                (VP (Vt saw) (NP (DT the) (NN dog)))
                (PP (IN with) (NP (DT the) (NN telescope)))))
            """)

        self.assertEqual(t, expected_tree)
Example #12
0
class UPCFG:
    """Unlexicalized PCFG.
    """
    def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
        """
        parsed_sents -- list of training trees.
        start -- start symbol.
        horzMarkov -- None for default. A number n >= 0 for horizontal markov.
        """
        self.start = start

        # count_Y_Z[X][rhs] = count(X -> rhs); count_X[X] = count(X).
        count_Y_Z = defaultdict(lambda: defaultdict(int))
        count_X = defaultdict(int)
        for t in parsed_sents:
            # Trees are mutable: work on a deep copy so the original
            # training tree is not modified.
            unle_tree = unlexicalize(t.copy(deep=True))
            # Chomsky normal form with horizontal markovization.
            unle_tree.chomsky_normal_form(horzMarkov=horzMarkov)
            # Collapse subtrees with a single child.
            unle_tree.collapse_unary(collapsePOS=True)
            for prod in unle_tree.productions():
                count_Y_Z[prod.lhs()][prod.rhs()] += 1
                count_X[prod.lhs()] += 1

        # Relative-frequency estimate: q(X -> rhs) = count(X -> rhs) / count(X)
        productions = []
        for X, c_X in count_X.items():
            for Y_Z, c_Y_Z in count_Y_Z[X].items():
                q = c_Y_Z / float(c_X)
                productions.append(ProbabilisticProduction(X, Y_Z, prob=q))

        self.production = productions

        grammar = PCFG(Nonterminal(start), productions)
        self.parser = CKYParser(grammar)

    def productions(self):
        """Returns the list of UPCFG probabilistic productions.
        """
        return self.production

    def parse(self, tagged_sent):
        """Parse a tagged sentence.

        tagged_sent -- the tagged sentence (a list of pairs (word, tag)).

        Returns the parse tree with the words as leaves. When CKY cannot
        parse the tags (or the sentence is empty) a flat tree is returned:
        the start symbol directly over one (tag word) subtree per token.
        """
        if not tagged_sent:
            # zip(*[]) yields nothing, so the unpacking below would raise
            # ValueError; an empty sentence gets the empty flat tree.
            return Tree(self.start, [])

        sent, tags = zip(*tagged_sent)
        prob_sent, tree = self.parser.parse(tags)

        if prob_sent == float('-inf'):
            # No parse found: flat tree.
            return Tree(self.start,
                        [Tree(tag, [word]) for word, tag in tagged_sent])

        # The CKY tree is in Chomsky normal form; undo it.
        tree.un_chomsky_normal_form()

        # Put the words back as the leaves of the tree.
        return lexicalize(tree, sent)