def get_subparse(backpointer):
    """Expand ``backpointer`` into every tree that can be rebuilt from it.

    A backpointer whose two children are both ``None`` is a lexical entry
    and yields a single one-level tree.  Otherwise each child index selects
    a cell of the ``table`` variable taken from the surrounding scope; every
    backpointer stored in that cell under the matching right-hand-side
    symbol is expanded recursively, and all left/right combinations are
    paired up.

    Args:
        backpointer: object exposing ``prod`` (a production offering
            ``lhs()``, ``rhs()`` and ``prob()``) together with ``l_child``
            and ``r_child``, each either ``None`` or indexable with ``[0]``
            and ``[1]``.

    Returns:
        list of ProbabilisticTree: one tree per pairing of a left and a
        right sub-parse (exactly one tree for a lexical entry).
    """
    rule = backpointer.prod
    # The tree fully determines the sentence, so
    # P(T, S) = P(T) P(S|T) = P(T).
    rule_prob = rule.prob()

    if backpointer.l_child is None and backpointer.r_child is None:
        word = str(rule.rhs()[0])
        return [ProbabilisticTree(str(rule.lhs()), [word], prob=rule_prob)]

    left_at = backpointer.l_child
    left_entries = table[left_at[0]][left_at[1]][rule.rhs()[0]]
    left_trees = [
        tree for entry in left_entries for tree in get_subparse(entry)
    ]

    right_at = backpointer.r_child
    right_entries = table[right_at[0]][right_at[1]][rule.rhs()[1]]
    right_trees = [
        tree for entry in right_entries for tree in get_subparse(entry)
    ]

    return [
        ProbabilisticTree(
            str(rule.lhs()),
            [left_tree, right_tree],
            prob=rule_prob * left_tree.prob() * right_tree.prob(),
        )
        for left_tree in left_trees
        for right_tree in right_trees
    ]
def parse(self, tokens):
    """Return the most probable parse of ``tokens``, or ``None``.

    Words that ``self.grammar.check_coverage`` rejects are replaced, in
    place, by ``self.unk`` before parsing.  Every occurrence of such a word
    is replaced (previously only the first one was, which left repeated
    unknown words in the input).

    The table is then filled bottom-up, shortest spans first.  For each
    span the productions are re-applied until no entry improves, because a
    new entry may enable further productions over the same span.

    Args:
        tokens: mutable list of hashable tokens; modified in place when
            uncovered words are found.

    Returns:
        The tree stored for ``self.grammar.start()`` over the whole input,
        or ``None`` when no such tree was built.
    """
    try:
        self.grammar.check_coverage(tokens)
    except ValueError as error:
        # The uncovered words are recovered from the exception text: the
        # part after the first colon, with quotes stripped and the final
        # character dropped, split on commas.
        listed = error.args[0].split(':')[1]
        listed = listed.replace('"', '').replace("'", "")[:-1]
        uncovered = {word.strip() for word in listed.split(',')}
        for position, token in enumerate(tokens):
            if token in uncovered:
                tokens[position] = self.unk

    # Seed the table with the tokens themselves.
    parse_table = {}
    for index, token in enumerate(tokens):
        parse_table[index, index + 1, token] = token

    size = len(tokens)
    for length in range(1, size + 1):
        for start in range(size - length + 1):
            span = (start, start + length)
            changed = True
            while changed:
                changed = False
                span_coverage = [
                    (production, matching_rule)
                    for production in self.grammar.productions()
                    for matching_rule in self.find_matching_rules(
                        production.rhs(), span, parse_table)
                ]
                for production, children in span_coverage:
                    # Tree probability = rule probability times the
                    # probability of every subtree child (tokens add none).
                    p = production.prob()
                    for child in children:
                        if isinstance(child, Tree):
                            p = p * child.prob()
                    key = (span[0], span[1], production.lhs())
                    tree = ProbabilisticTree(
                        production.lhs().symbol(), children, prob=p)
                    current = parse_table.get(key)
                    if current is None or current.prob() < tree.prob():
                        parse_table[key] = tree
                        changed = True

    return parse_table.get((0, size, self.grammar.start()))
def _apply_binary_rules(self, N, chart):
    """Populate the remainder of the chart, assuming the bottom row is complete.

    Iterating through the chart from the bottom up, apply all available
    binary rules at each position in the chart.  Each cell of the chart
    should enumerate the heads that can be produced there and the score
    corresponding to their most efficient construction.

    Args:
      - N: the number of words
      - chart: the chart to populate, see _apply_preterminal_rules for a
          detailed description.  It is updated in place; nothing is
          returned.
    """
    # Handle nonterminal rules A -> B C.  ordered_spans supplies the spans
    # to visit.
    for (i, j) in ordered_spans(N):
        # ``range`` rather than ``xrange`` so the loop also runs on
        # Python 3; the values iterated over are the same.
        for split in range(i + 1, j):
            # Consider all possible A -> B C
            for lhs_tree in chart[(i, split)].values():
                for rhs_tree in chart[(split, j)].values():
                    B = lhs_tree.label()
                    C = rhs_tree.label()
                    for (A, score) in self._grammar.lookup_rhs(B, C):
                        total_score = (lhs_tree.logprob()
                                       + rhs_tree.logprob()
                                       + score)
                        # Keep only the best-scoring tree per head A.
                        if total_score > chart[(i, j)][A].logprob():
                            chart[(i, j)][A] = ProbabilisticTree(
                                A, [lhs_tree, rhs_tree],
                                logprob=total_score)
def _parse(self, table): """ Helper function of :func:`CKYParser.parse` that implements the actual parsing algorithm. Args: - table :class:`ParserTable`: a probabilistic CKY matrix Returns: list -- all complete parses that could be derived. """ for end in xrange(1, table.num_leaves() + 1): for start in xrange(end - 2, -1, -1): top_node = table.top_node(start, end) trees = [] self.best_prob = 0.0 for split in xrange(start + 1, end): for l, r in product(table[start][split], table[split][end]): for prod in self.find_productions(l, r): # do not add a tree to trees if the top node is # an indexed node or if it is a rewritten # production. if top_node and linking(prod): continue prob = prod.prob() * l.prob() * r.prob() lhs = prod.lhs().symbol() if self.acceptable(lhs, prob, trees): trees.append( ProbabilisticTree(lhs, [l, r], prob=prob)) table[start][end] = trees
def _apply_preterminal_rules(self, words, chart): """Populate the bottom row of the CKY chart. Specifically, apply preterminal unary rules to go from word to preterminal. Args: - words: sequence of words to parse - chart: the chart to populate Returns: False if a preterminal could not be found in the grammar for a word. True otherwise. """ # Handle preterminal rules A -> a # For the ith token, you should populate cell (i,i+1). for i, word in enumerate(words): cell_key = (i,i+1) pass if (word,) not in self._grammar.parsing_index: return False for t in self._grammar.parsing_index[(word,)]: pos_tag = t[0] score = t[1] chart[cell_key][pos_tag] = ProbabilisticTree(pos_tag, [word], logprob=score) return True
def _apply_preterminal_rules(self, words, chart): """Populate the bottom row of the CKY chart. Specifically, apply preterminal unary rules to go from word to preterminal. Hint: use self._grammar.lookup_rhs(word) to enumerate available unary rules and their corresponding scores. Hint: A `chart` is a two level structure. The first key is a tuple representing the span. the second key is a part of speech that can be produced by that span. Finally, the value is a ProbabilisticTree containing the score of the best way to create that part of speech. As with A4 best_cuts_with_trace, it also maintains some book keeping to know how to create it. Concretely... chart[(i, i+1)][pos_tag] = ProbabilisticTree(pos_tag, [word], logprob=score) Args: - words: sequence of words to parse - chart: the chart to populate Returns: False if a preterminal could not be found in the grammar for a word. True otherwise. """ #### YOUR CODE HERE #### # Handle preterminal rules A -> a # For the ith token, you should populate cell (i,i+1). for i, word in enumerate(words): cell_key = (i, i + 1) lhs = self._grammar.lookup_rhs(word) for pos_tag in lhs: pt = ProbabilisticTree(pos_tag[0], [word], logprob=pos_tag[1]) chart[cell_key][pos_tag[0]] = pt if not lhs: return False #### END(YOUR CODE) #### return True
def initialize(tokens):
    """Build a parser table whose single-leaf cells hold the lexical trees.

    For every leaf position the productions that ``grammar`` (taken from
    the surrounding scope) returns for ``rhs=<token>`` are turned into
    ``ProbabilisticTree`` objects and stored in ``table[end - 1][end]``.

    Args:
        tokens: indexable sequence of input tokens.

    Returns:
        ParserTable: the table created from ``tokens`` with its single-leaf
        cells filled in.
    """
    table = ParserTable(tokens)
    # ``range`` instead of ``xrange`` keeps the loop working on Python 3.
    for end in range(1, table.num_leaves() + 1):
        word = tokens[end - 1]
        table[end - 1][end] = [
            ProbabilisticTree(rule.lhs().symbol(), rule.rhs(),
                              prob=rule.prob())
            for rule in grammar.productions(rhs=word)
        ]
    return table
def get_subparse(backpointer):
    """Rebuild the single tree described by ``backpointer``.

    A backpointer whose two children are both ``None`` is a lexical entry
    and becomes a one-level tree.  Otherwise each child index selects a
    cell of the ``table`` variable taken from the surrounding scope, the
    entry stored there under the matching right-hand-side symbol is
    expanded recursively, and the two results become the children.

    Args:
        backpointer: object exposing ``prob``, ``prod`` (a production
            offering ``lhs()`` and ``rhs()``), ``l_child`` and ``r_child``;
            each child is ``None`` or indexable with ``[0]`` and ``[1]``.

    Returns:
        ProbabilisticTree: the tree labelled with the production's
        left-hand side and carrying ``backpointer.prob``.
    """
    # The tree fully determines the sentence, so
    # P(T, S) = P(T) P(S|T) = P(T).
    tree_prob = backpointer.prob
    rule = backpointer.prod

    if backpointer.l_child is None and backpointer.r_child is None:
        word = str(rule.rhs()[0])
        return ProbabilisticTree(str(rule.lhs()), [word], prob=tree_prob)

    left_symbol = rule.rhs()[0]
    right_symbol = rule.rhs()[1]
    left_at = backpointer.l_child
    right_at = backpointer.r_child
    left_cell = table[left_at[0]][left_at[1]]
    right_cell = table[right_at[0]][right_at[1]]

    children = [
        get_subparse(left_cell[left_symbol]),
        get_subparse(right_cell[right_symbol]),
    ]
    return ProbabilisticTree(str(rule.lhs()), children, prob=tree_prob)
def make_tree(table, splits, left, right, nonterminal):
    """Rebuild the tree for ``nonterminal`` over the span ``left``..``right``.

    Args:
        table: mapping from ``(left, right, symbol)`` to the probability
            stored for that entry.
        splits: mapping from ``(left, right, symbol)`` to a triple
            ``(leftsym, rightsym, split)``; ``rightsym`` is ``None`` for a
            unary production.
        left, right: the two span indices used in the keys.
        nonterminal: the symbol to expand.  A plain string is treated as a
            terminal and returned unchanged.

    Returns:
        The string itself for a terminal, a ``ProbabilisticTree`` for a
        recorded nonterminal, or ``None`` when ``splits`` or ``table`` has
        no entry for the requested key.
    """
    # ``basestring`` only exists on Python 2; fall back to ``str`` so the
    # terminal check also works on Python 3.
    try:
        text_types = basestring
    except NameError:
        text_types = str

    if isinstance(nonterminal, text_types):
        # actually a terminal -- we're done
        return nonterminal

    try:
        leftsym, rightsym, split = splits[left, right, nonterminal]
        prob = table[left, right, nonterminal]
    except KeyError:
        return None

    if rightsym is None:
        # unary production: the child covers the same span
        children = [make_tree(table, splits, left, right, leftsym)]
    else:
        children = [
            make_tree(table, splits, left, split, leftsym),
            make_tree(table, splits, split, right, rightsym),
        ]
    return ProbabilisticTree(nonterminal.symbol(), children, prob=prob)
def _apply_binary_rules(self, N, chart):
    """Populate the remainder of the chart, assuming the bottom row is complete.

    Iterating through the chart from the bottom up, apply all available
    binary rules at each position in the chart.  Each cell of the chart
    should enumerate the heads that can be produced there and the score
    corresponding to their most efficient construction.

    self._grammar.lookup_rhs(B, C) supplies the binary rules A -> B C
    together with their score.  The trees stored as children are the full
    left and right trees, passed as a (left, right) pair.

    Args:
      - N: the number of words
      - chart: the chart to populate, see _apply_preterminal_rules for a
          detailed description.  Updated in place; nothing is returned.
    """
    # ordered_spans supplies the spans to visit.
    for (i, j) in ordered_spans(N):
        for split in range(i + 1, j):
            left_cell = chart[(i, split)]
            right_cell = chart[(split, j)]
            # Snapshot the heads available on each side of the split.
            left_heads = list(left_cell)
            right_heads = list(right_cell)
            for B in left_heads:
                for C in right_heads:
                    for A, rule_score in self._grammar.lookup_rhs(B, C):
                        left_tree = left_cell[B]
                        right_tree = right_cell[C]
                        score = (left_tree.logprob()
                                 + right_tree.logprob()
                                 + rule_score)
                        # Keep only the best-scoring tree per head A.
                        if score > chart[(i, j)][A].logprob():
                            chart[(i, j)][A] = ProbabilisticTree(
                                A, (left_tree, right_tree), logprob=score)
def pcky(sentence, pcfg):
    """
    Probabilistic CKY algorithm.

    Each word contributes one tree per entry that the symbol map holds for
    it; a word missing from the map uses the entries stored under
    ``'*unknown*'`` instead (the tree leaf is still the actual word).
    After a word is placed, every cell ending at that word is updated
    through ``update_cell``.

    :param sentence: List[str]
    :param pcfg: nltk.PCFG
    :return: the filled table
    """
    # map RHS to a set of LHS according to the grammar
    symbol_map = create_map(pcfg.productions())
    # initialize table for pcky
    table = initialize_table(len(sentence) + 1)

    for j, word in enumerate(sentence, 1):
        key = (word,)
        if key not in symbol_map:
            key = ('*unknown*',)
        for entry in symbol_map[key]:  # for all terminals
            table[j - 1][j].append(
                ProbabilisticTree(entry[0], [word], prob=entry[1]))

        for i in range(j - 2, -1, -1):  # from j-2 to 0
            for k in range(i + 1, j):  # from i+1 to j-1
                table = update_cell(table, i, k, j, symbol_map)
    return table
def update_cell(table, i, k, j, symbol_map, beam_size=28):
    """
    For current cell [i,j], update tree list and probability based on
    cell [i,k] and [k,j].

    Every pair of trees (one from each sub-cell) whose labels form a key
    of ``symbol_map`` adds one tree per entry stored under that key.  The
    cell is then sorted by decreasing probability and cut to the
    ``beam_size`` best trees.

    :param table: structure indexed as ``table[a, b]`` holding lists of
        trees; cell ``[i, j]`` is modified.
    :param i: start index of the span
    :param k: split point
    :param j: end index of the span
    :param symbol_map: maps a pair of labels to entries indexed as
        ``[0]`` (label of the new tree) and ``[1]`` (rule probability).
    :param beam_size: maximum number of trees kept in the cell; defaults
        to 28, the previously hard-coded width.  ``None`` keeps them all.
    :return: updated table
    """
    for s1 in table[i, k]:
        for s2 in table[k, j]:  # s1 and s2 are trees
            rhs = (s1.label(), s2.label())
            if rhs not in symbol_map:
                continue
            pair_prob = s1.prob() * s2.prob()
            for entry in symbol_map[rhs]:
                # add current tree to cell [i,j]
                table[i, j].append(
                    ProbabilisticTree(entry[0], [s1, s2],
                                      prob=entry[1] * pair_prob))

    # Beam pruning: keep only the most probable trees.
    ranked = sorted(table[i, j], key=lambda tree: tree.prob(), reverse=True)
    table[i, j] = ranked[:beam_size]
    return table
def _apply_binary_rules(self, N, chart):
    """Populate the remainder of the chart, assuming the bottom row is complete.

    Iterating through the chart from the bottom up, apply all available
    binary rules at each position in the chart.  Each cell of the chart
    should enumerate the heads that can be produced there and the score
    corresponding to their most efficient construction.

    self._grammar.lookup_rhs(B, C) supplies the binary rules A -> B C
    together with their score.  The trees stored as children are the full
    left and right trees, not just their labels.

    Args:
      - N: the number of words
      - chart: the chart to populate, see _apply_preterminal_rules for a
          detailed description.  Updated in place; nothing is returned.
    """
    # ordered_spans supplies the spans to visit.
    for (i, j) in ordered_spans(N):
        for split in range(i + 1, j):
            # Consider all possible A -> B C
            for B, left_tree in chart[(i, split)].items():
                for C, right_tree in chart[(split, j)].items():
                    for A, weight in self._grammar.lookup_rhs(B, C):
                        candidate = (left_tree.logprob()
                                     + right_tree.logprob()
                                     + weight)
                        target = chart[(i, j)]
                        # Keep only the best-scoring tree per head A.
                        if candidate > target[A].logprob():
                            target[A] = ProbabilisticTree(
                                A, [left_tree, right_tree],
                                logprob=candidate)
def _add_constituents_spanning(self, span, constituents, tokens): """ Find any constituents that might cover ``span``, and add them to the most likely constituents table. :rtype: None :type span: tuple(int, int) :param span: The section of the text for which we are trying to find possible constituents. The span is specified as a pair of integers, where the first integer is the index of the first token that should be included in the constituent; and the second integer is the index of the first token that should not be included in the constituent. I.e., the constituent should cover ``text[span[0]:span[1]]``, where ``text`` is the text that we are parsing. :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree) :param constituents: The most likely constituents table. This table records the most probable tree representation for any given span and node value. In particular, ``constituents(s,e,nv)`` is the most likely ``ProbabilisticTree`` that covers ``text[s:e]`` and has a node value ``nv.symbol()``, where ``text`` is the text that we are parsing. When ``_add_constituents_spanning`` is called, ``constituents`` should contain all possible constituents that are shorter than ``span``. :type tokens: list of tokens :param tokens: The text we are parsing. This is only used for trace output. """ # Since some of the grammar productions may be unary, we need to # repeatedly try all of the productions until none of them add any # new constituents. changed = True while changed: changed = False # Find all ways instantiations of the grammar productions that # cover the span. instantiations = self._find_instantiations(span, constituents, tokens) # For each production instantiation, add a new # ProbabilisticTree whose probability is the product # of the childrens' probabilities and the production's # probability. 
for (production, children) in instantiations: subtrees = [c for c in children if isinstance(c, Tree)] p = reduce(lambda pr, t: pr * t.prob(), subtrees, production.prob()) node = production.lhs().symbol() tree = ProbabilisticTree(node, children, prob=p) # If it's new a constituent, then add it to the # constituents dictionary. c = constituents.get((span[0], span[1], production.lhs())) if self._trace > 1: if c is None or c != tree: if c is None or c.prob() < tree.prob(): print(' Insert:', end=' ') else: print(' Discard:', end=' ') self._trace_production(production, p, span, len(tokens)) if c is None or c.prob() < tree.prob(): constituents[span[0], span[1], production.lhs()] = tree changed = True
def _add_constituents_spanning(self, span, constituents, tokens): """ Find any constituents that might cover C{span}, and add them to the most likely constituents table. @rtype: C{None} @type span: C{(int, int)} @param span: The section of the text for which we are trying to find possible constituents. The span is specified as a pair of integers, where the first integer is the index of the first token that should be included in the constituent; and the second integer is the index of the first token that should not be included in the constituent. I.e., the constituent should cover C{M{text}[span[0]:span[1]]}, where C{M{text}} is the text that we are parsing. @type constituents: C{dictionary} from C{(int,int,Nonterminal)} to (C{ProbabilisticToken} or C{ProbabilisticTree}). @param constituents: The most likely constituents table. This table records the most probable tree representation for any given span and node value. In particular, C{constituents(M{s},M{e},M{nv})} is the most likely C{ProbabilisticTree} that covers C{M{text}[M{s}:M{e}]} and has a node value C{M{nv}.symbol()}, where C{M{text}} is the text that we are parsing. When C{_add_constituents_spanning} is called, C{constituents} should contain all possible constituents that are shorter than C{span}. @type tokens: C{list} of tokens @param tokens: The text we are parsing. This is only used for trace output. """ # Since some of the grammar productions may be unary, we need to # repeatedly try all of the productions until none of them add any # new constituents. changed = True while changed: changed = False # Find all ways instantiations of the grammar productions that # cover the span. instantiations = self._find_instantiations(span, constituents) # For each production instantiation, add a new # ProbabilisticTree whose probability is the product # of the childrens' probabilities and the production's # probability. 
for (production, children) in instantiations: subtrees = [c for c in children if isinstance(c, Tree)] p = reduce(lambda pr,t:pr*t.prob(), subtrees, production.prob()) node = production.lhs().symbol() tree = ProbabilisticTree(node, children, prob=p) # If it's new a constituent, then add it to the # constituents dictionary. c = constituents.get((span[0], span[1], production.lhs())) if self._trace > 1: if c is None or c != tree: if c is None or c.prob() < tree.prob(): print ' Insert:', else: print ' Discard:', self._trace_production(production, p, span, len(tokens)) if c is None or c.prob() < tree.prob(): constituents[span[0], span[1], production.lhs()] = tree changed = True
def _add_constituents_spanning(self, span, constituents, tokens): """ Find any constituents that might cover ``span``, and add them to the most likely constituents table. :rtype: None :type span: tuple(int, int) :param span: The section of the text for which we are trying to find possible constituents. The span is specified as a pair of integers, where the first integer is the index of the first token that should be included in the constituent; and the second integer is the index of the first token that should not be included in the constituent. I.e., the constituent should cover ``text[span[0]:span[1]]``, where ``text`` is the text that we are parsing. :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree) :param constituents: The most likely constituents table. This table records the most probable tree representation for any given span and node value. In particular, ``constituents(s,e,nv)`` is the most likely ``ProbabilisticTree`` that covers ``text[s:e]`` and has a node value ``nv.symbol()``, where ``text`` is the text that we are parsing. When ``_add_constituents_spanning`` is called, ``constituents`` should contain all possible constituents that are shorter than ``span``. :type tokens: list of tokens :param tokens: The text we are parsing. This is only used for trace output. """ # Since some of the grammar productions may be unary, we need to # repeatedly try all of the productions until none of them add any # new constituents. changed = True while changed: changed = False # Find all ways instantiations of the grammar productions that # cover the span. instantiations = self._find_instantiations(span, constituents) # For each production instantiation, add a new # ProbabilisticTree whose probability is the product # of the childrens' probabilities and the production's # probability. 
for (production, children) in instantiations: subtrees = [c for c in children if isinstance(c, Tree)] p = reduce(lambda pr,t:pr*t.prob(), subtrees, production.prob()) node = production.lhs().symbol() tree = ProbabilisticTree(node, children, prob=p) # If it's new a constituent, then add it to the # constituents dictionary. c = constituents.get((span[0], span[1], production.lhs())) if self._trace > 1: if c is None or c != tree: if c is None or c.prob() < tree.prob(): print(' Insert:', end=' ') else: print(' Discard:', end=' ') self._trace_production(production, p, span, len(tokens)) if c is None or c.prob() < tree.prob(): constituents[span[0], span[1], production.lhs()] = tree changed = True
def make_chart():
    """Create an empty chart.

    The chart is a two-level ``defaultdict``: a missing first-level key
    creates an empty cell, and a missing key inside a cell creates a
    placeholder ``ProbabilisticTree`` with an empty label, no children
    and a log-probability of ``-np.inf``.
    """
    def new_placeholder():
        return ProbabilisticTree('', [], logprob=-np.inf)

    def new_cell():
        return collections.defaultdict(new_placeholder)

    return collections.defaultdict(new_cell)
def _add_constituents_spanning(self, span, constituents, tokens): """ Find any constituents that might cover C{span}, and add them to the most likely constituents table. @rtype: C{None} @type span: C{(int, int)} @param span: The section of the text for which we are trying to find possible constituents. The span is specified as a pair of integers, where the first integer is the index of the first token that should be included in the constituent; and the second integer is the index of the first token that should not be included in the constituent. I.e., the constituent should cover C{M{text}[span[0]:span[1]]}, where C{M{text}} is the text that we are parsing. @type constituents: C{dictionary} from C{(int,int,Nonterminal)} to (C{ProbabilisticToken} or C{ProbabilisticTree}). @param constituents: The most likely constituents table. This table records the most probable tree representation for any given span and node value. In particular, C{constituents(M{s},M{e},M{nv})} is the most likely C{ProbabilisticTree} that covers C{M{text}[M{s}:M{e}]} and has a node value C{M{nv}.symbol()}, where C{M{text}} is the text that we are parsing. When C{_add_constituents_spanning} is called, C{constituents} should contain all possible constituents that are shorter than C{span}. @type tokens: C{list} of tokens @param tokens: The text we are parsing. This is only used for trace output. """ # Since some of the grammar productions may be unary, we need to # repeatedly try all of the productions until none of them add any # new constituents. changed = True while changed: changed = False # Find all ways instantiations of the grammar productions that # cover the span. instantiations = self._find_instantiations(span, constituents) # For each production instantiation, add a new # ProbabilisticTree whose probability is the product # of the childrens' probabilities and the production's # probability. 
for (production, children) in instantiations: subtrees = [c for c in children if isinstance(c, Tree)] p = reduce(lambda pr, t: pr * t.prob(), subtrees, production.prob()) node = production.lhs().symbol() tree = ProbabilisticTree(node, children, prob=p) # If it's new a constituent, then add it to the # constituents dictionary. c = constituents.get((span[0], span[1], production.lhs())) if self._trace > 1: if c is None or c != tree: if c is None or c.prob() < tree.prob(): print ' Insert:', else: print ' Discard:', self._trace_production(production, p, span, len(tokens)) if c is None or c.prob() < tree.prob(): constituents[span[0], span[1], production.lhs()] = tree changed = True