def parse(self, tokens):
    """
    Return the most probable tree covering all of ``tokens``, or ``None``.

    Tokens that the grammar does not cover are overwritten in place with
    ``self.unk`` before parsing, so the caller's list is modified.  The
    chart is then filled bottom-up, span by span; for every span and
    left-hand side only the highest-probability tree is kept.

    :type tokens: list
    :param tokens: The sentence to parse.  Must be a mutable sequence,
        because uncovered tokens are replaced in place.
    :return: The entry stored for ``(0, len(tokens), grammar.start())``,
        or ``None`` when no such entry was built.
    """
    # Fast path: a single coverage check for the whole sentence.
    try:
        self.grammar.check_coverage(tokens)
    except ValueError:
        # Identify the offending tokens by re-checking them one at a
        # time instead of parsing the exception text: the message is a
        # comma separated, quoted list, so tokens such as "," or ":"
        # (or anything containing quotes) cannot be recovered from it.
        for index, token in enumerate(tokens):
            try:
                self.grammar.check_coverage([token])
            except ValueError:
                tokens[index] = self.unk

    size = len(tokens)
    productions = self.grammar.productions()

    # Seed the chart with the tokens themselves.
    parse_table = {}
    for index, token in enumerate(tokens):
        parse_table[index, index + 1, token] = token

    for length in range(1, size + 1):
        for start in range(size - length + 1):
            span = (start, start + length)
            # An entry added for this span can enable further entries
            # for the same span, so repeat until a pass adds nothing.
            changed = True
            while changed:
                changed = False

                # Collect every match first; the table is only updated
                # after the whole grammar has been scanned.
                span_coverage = []
                for production in productions:
                    matching_rules = self.find_matching_rules(
                        production.rhs(), span, parse_table)
                    for matching_rule in matching_rules:
                        span_coverage.append((production, matching_rule))

                for production, children in span_coverage:
                    # Probability = production probability times the
                    # probability of every subtree child.
                    p = production.prob()
                    for child in children:
                        if isinstance(child, Tree):
                            p *= child.prob()

                    lhs = production.lhs()
                    tree = ProbabilisticTree(lhs.symbol(), children, prob=p)
                    key = (span[0], span[1], lhs)
                    best = parse_table.get(key)
                    if best is None or best.prob() < tree.prob():
                        parse_table[key] = tree
                        changed = True

    return parse_table.get((0, size, self.grammar.start()))
def _add_constituents_spanning(self, span, constituents, tokens):
    """
    Record the most probable constituents that cover ``span``.

    Each ``(production, children)`` pair returned by
    ``_find_instantiations`` is turned into a ``ProbabilisticTree`` whose
    probability is the production's probability multiplied by the
    probability of every ``Tree`` child.  The tree is stored in
    ``constituents`` under ``(span[0], span[1], production.lhs())`` when
    that slot is empty or holds a less probable entry.

    :rtype: None
    :type span: tuple(int, int)
    :param span: The ``(start, end)`` pair identifying the stretch of
        text being covered; its two items form the first two components
        of the table key.
    :type constituents: dict(tuple(int,int,Nonterminal) ->
        ProbabilisticToken or ProbabilisticTree)
    :param constituents: The most likely constituents table, updated in
        place.
    :type tokens: list of tokens
    :param tokens: The text being parsed.  It is forwarded to
        ``_find_instantiations`` and its length is passed to
        ``_trace_production`` for trace output.
    """
    # Productions may be unary, so a tree stored in one pass can make new
    # instantiations possible for the same span; keep going until a full
    # pass stores nothing.
    while True:
        stored_any = False

        for production, children in self._find_instantiations(
                span, constituents, tokens):
            # Production probability times each subtree's probability.
            prob = production.prob()
            for child in children:
                if isinstance(child, Tree):
                    prob *= child.prob()

            lhs = production.lhs()
            candidate = ProbabilisticTree(lhs.symbol(), children, prob=prob)
            key = (span[0], span[1], lhs)
            current = constituents.get(key)
            is_better = current is None or current.prob() < candidate.prob()

            if self._trace > 1 and (current is None or current != candidate):
                label = ' Insert:' if is_better else ' Discard:'
                print(label, end=' ')
                self._trace_production(production, prob, span, len(tokens))

            if is_better:
                constituents[key] = candidate
                stored_any = True

        if not stored_any:
            break
def _add_constituents_spanning(self, span, constituents, tokens):
    """
    Find any constituents that might cover C{span}, and add them to
    the most likely constituents table.

    Each instantiation returned by C{_find_instantiations} becomes a
    C{ProbabilisticTree} whose probability is the production's
    probability multiplied by the probability of every C{Tree} child.
    It replaces the table entry for
    C{(span[0], span[1], production.lhs())} when that entry is missing
    or less probable.

    @rtype: C{None}
    @type span: C{(int, int)}
    @param span: The C{(start, end)} pair identifying the stretch of
        text being covered; its two items form the first two
        components of the table key.
    @type constituents: C{dictionary} from
        C{(int,int,Nonterminal)} to (C{ProbabilisticToken} or
        C{ProbabilisticTree}).
    @param constituents: The most likely constituents table, updated
        in place.
    @type tokens: C{list} of tokens
    @param tokens: The text we are parsing.  Only its length is used,
        for trace output.
    """
    # Since some of the grammar productions may be unary, we need to
    # repeatedly try all of the productions until none of them add any
    # new constituents.
    changed = True
    while changed:
        changed = False

        # Find all instantiations of the grammar productions that
        # cover the span.
        instantiations = self._find_instantiations(span, constituents)

        for (production, children) in instantiations:
            # Multiply explicitly rather than through ``reduce``, which
            # is not a builtin on Python 3.
            p = production.prob()
            for child in children:
                if isinstance(child, Tree):
                    p *= child.prob()
            node = production.lhs().symbol()
            tree = ProbabilisticTree(node, children, prob=p)

            # Keep the tree only if the slot is empty or it beats the
            # entry already stored there.
            key = (span[0], span[1], production.lhs())
            c = constituents.get(key)
            better = c is None or c.prob() < tree.prob()

            if self._trace > 1:
                if c is None or c != tree:
                    # print() call form: the old ``print '...',``
                    # statement is a syntax error on Python 3.
                    if better:
                        print(' Insert:', end=' ')
                    else:
                        print(' Discard:', end=' ')
                    self._trace_production(production, p, span, len(tokens))

            if better:
                constituents[key] = tree
                changed = True
def _add_constituents_spanning(self, span, constituents, tokens):
    """
    Add the best constituents covering ``span`` to the constituents table.

    For every ``(production, children)`` pair that
    ``_find_instantiations`` reports for ``span``, a ``ProbabilisticTree``
    is built with probability equal to the production's probability
    times the probability of each ``Tree`` child.  The tree is written to
    ``constituents[span[0], span[1], production.lhs()]`` unless an entry
    of equal or higher probability is already there.

    :rtype: None
    :type span: tuple(int, int)
    :param span: The ``(start, end)`` pair identifying the stretch of
        text being covered; its two items form the first two components
        of the table key.
    :type constituents: dict(tuple(int,int,Nonterminal) ->
        ProbabilisticToken or ProbabilisticTree)
    :param constituents: The most likely constituents table, updated in
        place.
    :type tokens: list of tokens
    :param tokens: The text being parsed.  Only its length is used, for
        trace output.
    """
    tracing = self._trace > 1
    made_progress = True

    # Unary productions can build on entries stored for this very span,
    # so rescan until a complete pass leaves the table untouched.
    while made_progress:
        made_progress = False
        found = self._find_instantiations(span, constituents)

        for production, children in found:
            probability = production.prob()
            for subtree in (kid for kid in children if isinstance(kid, Tree)):
                probability = probability * subtree.prob()

            head = production.lhs()
            new_tree = ProbabilisticTree(head.symbol(), children,
                                         prob=probability)
            slot = (span[0], span[1], head)
            old_tree = constituents.get(slot)
            wins = old_tree is None or old_tree.prob() < new_tree.prob()

            if tracing and (old_tree is None or old_tree != new_tree):
                if wins:
                    print(' Insert:', end=' ')
                else:
                    print(' Discard:', end=' ')
                self._trace_production(production, probability, span,
                                       len(tokens))

            if not wins:
                continue
            constituents[slot] = new_tree
            made_progress = True
def _add_constituents_spanning(self, span, constituents, tokens):
    """
    Find any constituents that might cover C{span}, and add them to
    the most likely constituents table.

    Every C{(production, children)} pair returned by
    C{_find_instantiations} is converted to a C{ProbabilisticTree}
    whose probability is the production's probability times the
    probability of each C{Tree} child.  The tree is stored under
    C{(span[0], span[1], production.lhs())} when no entry exists there
    or the existing entry is less probable.

    @rtype: C{None}
    @type span: C{(int, int)}
    @param span: The C{(start, end)} pair identifying the stretch of
        text being covered; its two items form the first two
        components of the table key.
    @type constituents: C{dictionary} from
        C{(int,int,Nonterminal)} to (C{ProbabilisticToken} or
        C{ProbabilisticTree}).
    @param constituents: The most likely constituents table, updated
        in place.
    @type tokens: C{list} of tokens
    @param tokens: The text we are parsing.  Only its length is used,
        for trace output.
    """
    # Since some of the grammar productions may be unary, we need to
    # repeatedly try all of the productions until none of them add any
    # new constituents.
    changed = True
    while changed:
        changed = False

        for (production, children) in self._find_instantiations(
                span, constituents):
            # Explicit product instead of ``reduce``, which is no
            # longer a builtin on Python 3.
            p = production.prob()
            for child in children:
                if isinstance(child, Tree):
                    p *= child.prob()

            lhs = production.lhs()
            tree = ProbabilisticTree(lhs.symbol(), children, prob=p)

            # Compare against whatever is already stored for this span
            # and left-hand side.
            c = constituents.get((span[0], span[1], lhs))
            improves = c is None or c.prob() < tree.prob()

            if self._trace > 1 and (c is None or c != tree):
                # The Python 2 ``print '...',`` statement does not
                # compile on Python 3; use the function form.
                print(' Insert:' if improves else ' Discard:', end=' ')
                self._trace_production(production, p, span, len(tokens))

            if improves:
                constituents[span[0], span[1], lhs] = tree
                changed = True