def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
    """Induce a PCFG from training trees and build a CKY parser.

    parsed_sents -- list of training trees.
    start -- start symbol.
    horzMarkov -- None for default. A number n >= 0 for horizontal markov.
    """
    self.start = start

    # rule_counts[X][rhs] = count(X -> rhs); lhs_counts[X] = count(X).
    rule_counts = defaultdict(lambda: defaultdict(int))
    lhs_counts = defaultdict(int)

    for tree in parsed_sents:
        # Deep-copy first: unlexicalize mutates the tree it receives,
        # and we must not modify the caller's training data.
        skeleton = unlexicalize(tree.copy(deep=True))
        # Binarize with the requested horizontal Markovization.
        skeleton.chomsky_normal_form(horzMarkov=horzMarkov)
        # Collapse unary chains (including POS nodes).
        skeleton.collapse_unary(collapsePOS=True)
        for prod in skeleton.productions():
            rule_counts[prod.lhs()][prod.rhs()] += 1
            lhs_counts[prod.lhs()] += 1

    # Maximum-likelihood estimate: q(X -> rhs) = count(X -> rhs) / count(X).
    productions = [
        ProbabilisticProduction(lhs, rhs, prob=n_rule / float(n_lhs))
        for lhs, n_lhs in lhs_counts.items()
        for rhs, n_rule in rule_counts[lhs].items()
    ]

    self.production = productions
    self.parser = CKYParser(PCFG(Nonterminal(start), productions))
def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
    """Induce a PCFG from training trees and build a CKY parser.

    parsed_sents -- list of training trees.
    start -- start symbol of the grammar.
    horzMarkov -- horizontal Markovization order (None for default).
    """
    # production_count[A -> B] = count(A -> B);
    # lhs_count[A] = count(A) (left-hand-side occurrences).
    production_count = defaultdict(int)
    lhs_count = defaultdict(int)

    self.start = start
    # Productions for the CKY parser's grammar.
    self.prods = []

    # Deep-copy each tree before unlexicalizing: unlexicalize modifies
    # its argument in place and the originals must stay intact.
    unlex_sents = [unlexicalize(t.copy(deep=True)) for t in parsed_sents]

    for tree in unlex_sents:
        tree.chomsky_normal_form(horzMarkov=horzMarkov)
        tree.collapse_unary(collapsePOS=True, collapseRoot=True)
        # Each production is an nltk.grammar.Production whose lhs is a
        # Nonterminal and whose rhs is a tuple of Nonterminals.
        for prod in tree.productions():
            production_count[prod] += 1
            lhs_count[prod.lhs()] += 1

    # Maximum-likelihood estimate: q(A -> B) = count(A -> B) / count(A).
    for prod, n_prod in production_count.items():
        n_lhs = lhs_count.get(prod.lhs(), 0)
        q_ml = float(n_prod) / n_lhs
        self.prods.append(
            ProbabilisticProduction(prod.lhs(), prod.rhs(), prob=q_ml))

    # PCFG(start, productions): start is a Nonterminal, productions a
    # list of ProbabilisticProduction.
    grammar = PCFG(Nonterminal(start), self.prods)
    self.my_parser = CKYParser(grammar)
def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
    """Induce a PCFG from training trees and build a CKY parser.

    parsed_sents -- list of training trees.

    Note: the grammar's start symbol is taken from the 'start'
    parameter, not from the root label of the trees in parsed_sents.
    """
    self.start = N(start)
    self.horzMarkov = horzMarkov

    # Accumulate productions WITH repetitions: induce_pcfg derives the
    # probabilities from how often each production occurs.
    productions = []
    for tree in parsed_sents:
        # Deep-copy first: unlexicalize mutates the tree in place.
        skeleton = unlexicalize(tree.copy(deep=True))
        # Relabel the root so every tree hangs from the chosen start symbol.
        skeleton.set_label(start)
        skeleton.chomsky_normal_form(horzMarkov=horzMarkov)
        # Collapse unary chains everywhere, root included.
        skeleton.collapse_unary(collapsePOS=True, collapseRoot=True)
        productions.extend(skeleton.productions())

    self.pcfg = induce_pcfg(self.start, productions)
    self._probabilistic_productions = self.pcfg.productions()
    self._parser = CKYParser(self.pcfg)
def test_parse(self):
    """End-to-end CKY check on an unambiguous sentence: verifies the
    dynamic-programming chart (_pi), the backpointers (_bp), the
    returned tree and its log2-probability.
    """
    grammar = PCFG.fromstring("""
        S -> NP Vp [1.0]
        NP -> Det Noun [0.6]
        NP -> Noun Adj [0.4]
        VP -> Verb NP [1.0]
        Det -> 'el' [1.0]
        Noun -> 'gato' [0.9]
        Noun -> 'pescado' [0.1]
        Verb -> 'come' [1.0]
        Adj -> 'crudo' [1.0]
    """)
    parser = CKYParser(grammar)

    lp, t = parser.parse('el gato come pescado crudo'.split())

    # check chart
    # _pi[(i, j)] maps each nonterminal spanning words i..j (1-indexed,
    # inclusive) to the log2-probability of its best subtree.
    pi = {
        (1, 1): {'Det': log2(1.0)},
        (2, 2): {'Noun': log2(0.9)},
        (3, 3): {'Verb': log2(1.0)},
        (4, 4): {'Noun': log2(0.1)},
        (5, 5): {'Adj': log2(1.0)},

        (1, 2): {'NP': log2(0.6 * 1.0 * 0.9)},
        (2, 3): {},
        (3, 4): {},
        (4, 5): {'NP': log2(0.4 * 0.1 * 1.0)},

        (1, 3): {},
        (2, 4): {},
        (3, 5): {'VP': log2(1.0) + log2(1.0) + log2(0.4 * 0.1 * 1.0)},

        (1, 4): {},
        (2, 5): {},

        (1, 5): {'S':
                 log2(1.0) +  # rule S -> NP VP
                 log2(0.6 * 1.0 * 0.9) +  # left part
                 log2(1.0) + log2(1.0) + log2(0.4 * 0.1 * 1.0)},  # right part
    }
    self.assertEqualPi(parser._pi, pi)

    # check partial results
    # _bp[(i, j)] maps each nonterminal to the best partial tree
    # covering words i..j.
    bp = {
        (1, 1): {'Det': Tree.fromstring("(Det el)")},
        (2, 2): {'Noun': Tree.fromstring("(Noun gato)")},
        (3, 3): {'Verb': Tree.fromstring("(Verb come)")},
        (4, 4): {'Noun': Tree.fromstring("(Noun pescado)")},
        (5, 5): {'Adj': Tree.fromstring("(Adj crudo)")},

        (1, 2): {'NP': Tree.fromstring("(NP (Det el) (Noun gato))")},
        (2, 3): {},
        (3, 4): {},
        (4, 5): {'NP': Tree.fromstring("(NP (Noun pescado) (Adj crudo))")},

        (1, 3): {},
        (2, 4): {},
        (3, 5): {'VP': Tree.fromstring(
            "(VP (Verb come) (NP (Noun pescado) (Adj crudo)))")},

        (1, 4): {},
        (2, 5): {},

        (1, 5): {'S': Tree.fromstring("""(S
            (NP (Det el) (Noun gato))
            (VP (Verb come) (NP (Noun pescado) (Adj crudo)))
            )
        """)},
    }
    self.assertEqual(parser._bp, bp)

    # check tree: the single possible parse for this grammar.
    t2 = Tree.fromstring("""
        (S
            (NP (Det el) (Noun gato))
            (VP (Verb come) (NP (Noun pescado) (Adj crudo)))
        )
    """)
    self.assertEqual(t, t2)

    # check log probability: product of all rule probabilities used.
    lp2 = log2(1.0 * 0.6 * 1.0 * 0.9 * 1.0 * 1.0 * 0.4 * 0.1 * 1.0)
    self.assertAlmostEqual(lp, lp2)
def test_parse_2(self):
    """CKY on a PP-attachment ambiguity where the grammar weights favor
    verb attachment (VP -> VP PP has probability 0.9): checks the chart
    (_pi) and the backpointers (_bp) over the full 8-word sentence.
    """
    grammar = PCFG.fromstring("""
        S -> NP VP [1.0]
        NP -> NP PP [0.5]
        NP -> Det Noun [0.5]
        VP -> VP PP [0.9]
        VP -> Verb NP [0.1]
        PP -> Prep NP [1.0]
        Noun -> 'dog' [0.2]
        Noun -> 'man' [0.2]
        Noun -> 'town' [0.6]
        Verb -> 'saw' [1.0]
        Prep -> 'in' [1.0]
        Det -> 'the' [1.0]
    """)
    parser = CKYParser(grammar)

    lp, t = parser.parse('the man saw the dog in the town'.split())

    # check chart
    # _pi[(i, j)]: log2-probability of the best subtree for each
    # nonterminal spanning words i..j (1-indexed, inclusive).
    pi = {
        (1, 1): {'Det': log2(1.0)},
        (2, 2): {'Noun': log2(0.2)},
        (3, 3): {'Verb': log2(1.0)},
        (4, 4): {'Det': log2(1.0)},
        (5, 5): {'Noun': log2(0.2)},
        (6, 6): {'Prep': log2(1.0)},
        (7, 7): {'Det': log2(1.0)},
        (8, 8): {'Noun': log2(0.6)},

        (1, 2): {'NP': -3.321928094887362},
        (2, 3): {},
        (3, 4): {},
        (4, 5): {'NP': -3.321928094887362},
        (5, 6): {},
        (6, 7): {},
        (7, 8): {'NP': -1.736965594166206},

        (1, 3): {},
        (2, 4): {},
        (3, 5): {'VP': -6.643856189774724},
        (4, 6): {},
        (5, 7): {},
        (6, 8): {'PP': -1.736965594166206},

        (1, 4): {},
        (2, 5): {},
        (3, 6): {},
        (4, 7): {},
        (5, 8): {},

        (1, 5): {'S': -9.965784284662087},
        (2, 6): {},
        (3, 7): {},
        (4, 8): {'NP': -6.058893689053567},

        (1, 6): {},
        (2, 7): {},
        (3, 8): {'VP': -8.53282487738598},

        (1, 7): {},
        (2, 8): {},

        (1, 8): {'S': -11.854752972273342},
    }
    self.assertEqualPi(parser._pi, pi)

    # check backpointers: best partial tree for each span/nonterminal.
    bp = {
        (1, 1): {'Det': Tree.fromstring('(Det the)')},
        (2, 2): {'Noun': Tree.fromstring('(Noun man)')},
        (3, 3): {'Verb': Tree.fromstring('(Verb saw)')},
        (4, 4): {'Det': Tree.fromstring('(Det the)')},
        (5, 5): {'Noun': Tree.fromstring('(Noun dog)')},
        (6, 6): {'Prep': Tree.fromstring('(Prep in)')},
        (7, 7): {'Det': Tree.fromstring('(Det the)')},
        (8, 8): {'Noun': Tree.fromstring('(Noun town)')},

        (1, 2): {'NP': Tree.fromstring('(NP (Det the) (Noun man))')},
        (2, 3): {},
        (3, 4): {},
        (4, 5): {'NP': Tree.fromstring('(NP (Det the) (Noun dog))')},
        (5, 6): {},
        (6, 7): {},
        (7, 8): {'NP': Tree.fromstring('(NP (Det the) (Noun town))')},

        (1, 3): {},
        (2, 4): {},
        (3, 5): {'VP': Tree.fromstring(
            '(VP (Verb saw) (NP (Det the) (Noun dog)))')},
        (4, 6): {},
        (5, 7): {},
        (6, 8): {'PP': Tree.fromstring(
            '(PP (Prep in) (NP (Det the) (Noun town)))')},

        (1, 4): {},
        (2, 5): {},
        (3, 6): {},
        (4, 7): {},
        (5, 8): {},

        (1, 5): {'S': Tree.fromstring("""(S
            (NP (Det the) (Noun man))
            (VP (Verb saw) (NP (Det the) (Noun dog))))""")},
        (2, 6): {},
        (3, 7): {},
        (4, 8): {'NP': Tree.fromstring("""(NP
            (NP (Det the) (Noun dog))
            (PP (Prep in) (NP (Det the) (Noun town))))""")},

        (1, 6): {},
        (2, 7): {},
        (3, 8): {'VP': Tree.fromstring("""(VP
            (VP (Verb saw) (NP (Det the) (Noun dog)))
            (PP (Prep in) (NP (Det the) (Noun town))))""")},

        (1, 7): {},
        (2, 8): {},

        (1, 8): {'S': Tree.fromstring("""(S
            (NP (Det the) (Noun man))
            (VP
                (VP (Verb saw) (NP (Det the) (Noun dog)))
                (PP (Prep in) (NP (Det the) (Noun town)))))""")},
    }
    self.assertEqual(parser._bp, bp)
def test_parse_ambiguity(self):
    """PP-attachment ambiguity: with VP -> VP PP weighted higher than
    NP -> NP PP, the parser must attach the prepositional phrase to the
    verb phrase.

    Example taken from pages 4, 5 and 8 of Michael Collins' notes,
    "Probabilistic Context-Free Grammars (PCFGs)".
    """
    # NOTE: terminals must be quoted in PCFG.fromstring ('saw', 'man',
    # ...); unquoted symbols are read as nonterminals, so the words of
    # the sentence would never be covered by the grammar.
    grammar = PCFG.fromstring("""
        S -> NP VP [1.0]
        VP -> Vt NP [0.65]
        VP -> VP PP [0.35]
        NP -> DT NN [0.8]
        NP -> NP PP [0.2]
        PP -> IN NP [1.0]
        Vt -> 'saw' [1.0]
        NN -> 'man' [0.2]
        NN -> 'telescope' [0.3]
        NN -> 'dog' [0.5]
        DT -> 'the' [1.0]
        IN -> 'with' [1.0]
    """)
    # Changing the weights to:
    #   VP -> Vt NP [0.85]
    #   VP -> VP PP [0.15]
    # yields the other (noun-attachment) tree instead.
    parser = CKYParser(grammar)

    lp, t = parser.parse('the man saw the dog with the telescope'.split())
    # draw_trees(t)

    # check tree: the PP attaches to the VP under these weights.
    t2 = Tree.fromstring("""
        (S
            (NP
                (DT the)
                (NN man)
            )
            (VP
                (VP
                    (Vt saw)
                    (NP
                        (DT the)
                        (NN dog)
                    )
                )
                (PP
                    (IN with)
                    (NP
                        (DT the)
                        (NN telescope)
                    )
                )
            )
        )
    """)
    self.assertEqual(t, t2)

    # check log probability: product of every rule probability used in
    # the best parse.
    lp2 = log2(1.0 * 0.8 * 1.0 * 0.2 *
               0.35 * 0.65 * 1.0 * 0.8 * 1.0 * 0.5 *
               1.0 * 1.0 * 0.8 * 1.0 * 0.3)
    self.assertAlmostEqual(lp, lp2)
def test_ambiguo(self):
    """PP-attachment ambiguity where NP -> NP PP is favored over
    VP -> VP PP in the NP column but VP -> VP PP still wins overall:
    checks the chart (_pi) and the selected (verb-attachment) tree.
    """
    grammar = PCFG.fromstring("""
        S -> NP VP [1.0]
        VP -> Vt NP [0.3]
        VP -> VP PP [0.7]
        NP -> NP PP [0.6]
        NP -> DT NN [0.4]
        PP -> IN NP [1.0]
        Vt -> 'saw' [1.0]
        NN -> 'man' [0.33]
        NN -> 'telescope' [0.33]
        NN -> 'dog' [0.34]
        DT -> 'the' [1.0]
        IN -> 'with' [1.0]
    """)
    parser = CKYParser(grammar)

    lp, t = parser.parse('the man saw the dog with the telescope'.split())

    # Expected chart: _pi[(i, j)] holds the log2-probability of the
    # best subtree for each nonterminal spanning words i..j.
    pi = {
        (1, 1): {'DT': 0.0},
        (2, 2): {'NN': -1.5994620704162712},
        (3, 3): {'Vt': 0.0},
        (4, 4): {'DT': 0.0},
        (5, 5): {'NN': -1.5563933485243853},
        (6, 6): {'IN': 0.0},
        (7, 7): {'DT': 0.0},
        (8, 8): {'NN': -1.5994620704162712},

        (1, 2): {'NP': -2.9213901653036336},
        (2, 3): {},
        (3, 4): {},
        (4, 5): {'NP': -2.8783214434117474},
        (5, 6): {},
        (6, 7): {},
        (7, 8): {'NP': -2.9213901653036336},

        (1, 3): {},
        (2, 4): {},
        (3, 5): {'VP': -4.6152870375779536},
        (4, 6): {},
        (5, 7): {},
        (6, 8): {'PP': -2.9213901653036336},

        (1, 4): {},
        (2, 5): {},
        (3, 6): {},
        (4, 7): {},
        (5, 8): {},

        (1, 5): {'S': -7.536677202881587},
        (2, 6): {},
        (3, 7): {},
        (4, 8): {'NP': -6.536677202881587},

        (1, 6): {},
        (2, 7): {},
        (3, 8): {'VP': -8.051250375711346},

        (1, 7): {},
        (2, 8): {},

        (1, 8): {'S': -10.972640541014979}
    }
    self.assertEqualPi(parser._pi, pi)

    # The best parse attaches the PP to the verb phrase.
    t2 = Tree.fromstring("""
        (S
            (NP (DT the) (NN man))
            (VP
                (VP (Vt saw) (NP (DT the) (NN dog)))
                (PP (IN with) (NP (DT the) (NN telescope)))))
    """)
    self.assertEqual(t, t2)