def test_la_viterbi_parsing_3(self): grammar = LCFRS("S") # rule 0 lhs = LCFRS_lhs("B") lhs.add_arg(["a"]) grammar.add_rule(lhs, [], 0.25) # rule 1 lhs = LCFRS_lhs("A") lhs.add_arg(["a"]) grammar.add_rule(lhs, [], 0.5) # rule 2 lhs = LCFRS_lhs("S") lhs.add_arg([LCFRS_var(0, 0)]) grammar.add_rule(lhs, ["B"], 1.0) # rule 3 lhs = LCFRS_lhs("A") lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)]) grammar.add_rule(lhs, ["A", "B"], 0.5) # rule 4 lhs = LCFRS_lhs("B") lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)]) grammar.add_rule(lhs, ["A", "B"], 0.75) grammar.make_proper() inp = ["a"] * 3 nontMap = Enumerator() gi = PyGrammarInfo(grammar, nontMap) sm = PyStorageManager() print(nontMap.object_index("S")) print(nontMap.object_index("B")) la = build_PyLatentAnnotation_initial(grammar, gi, sm) parser = DiscodopKbestParser(grammar, la=la, nontMap=nontMap, grammarInfo=gi, latent_viterbi_mode=True) parser.set_input(inp) parser.parse() self.assertTrue(parser.recognized()) der = parser.latent_viterbi_derivation(True) print(der) der2 = None for w, der_ in parser.k_best_derivation_trees(): if der2 is None: der2 = der_ print(w, der_) print(der2)
def __test_projection(self, split_weights, goal_weights, merge_method=False): grammar = LCFRS("S") # rule 0 lhs = LCFRS_lhs("S") lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)]) grammar.add_rule(lhs, ["A", "A"]) # rule 1 lhs = LCFRS_lhs("A") lhs.add_arg(["a"]) grammar.add_rule(lhs, []) lhs = LCFRS_lhs("A") lhs.add_arg(["b"]) grammar.add_rule(lhs, [], weight=2.0) grammar.make_proper() # print(grammar) nonterminal_map = Enumerator() grammarInfo = PyGrammarInfo(grammar, nonterminal_map) storageManager = PyStorageManager() la = build_PyLatentAnnotation([1, 2], [1.0], split_weights, grammarInfo, storageManager) # parser = LCFRS_parser(grammar) # parser.set_input(["a", "b"]) # parser.parse() # der = parser.best_derivation_tree() # print(la.serialize()) if merge_method: la.project_weights(grammar, grammarInfo) else: splits, _, _ = la.serialize() merge_sources = [[[ split for split in range(0, splits[nont_idx]) ]] for nont_idx in range(0, nonterminal_map.get_counter())] # print("Projecting to fine grammar LA", file=self.logger) coarse_la = la.project_annotation_by_merging(grammarInfo, merge_sources, debug=False) coarse_la.project_weights(grammar, grammarInfo) # print(grammar) for i in range(3): self.assertAlmostEqual( grammar.rule_index(i).weight(), goal_weights[i])
def test_projection_based_parser_k_best_hack(self): grammar = LCFRS("S") # rule 0 lhs = LCFRS_lhs("B") lhs.add_arg(["a"]) grammar.add_rule(lhs, [], 0.25) # rule 1 lhs = LCFRS_lhs("A") lhs.add_arg(["a"]) grammar.add_rule(lhs, [], 0.5) # rule 2 lhs = LCFRS_lhs("S") lhs.add_arg([LCFRS_var(0, 0)]) grammar.add_rule(lhs, ["B"], 1.0) # rule 3 lhs = LCFRS_lhs("A") lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)]) grammar.add_rule(lhs, ["A", "B"], 0.5) # rule 4 lhs = LCFRS_lhs("B") lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)]) grammar.add_rule(lhs, ["A", "B"], 0.75) grammar.make_proper() inp = ["a"] * 3 nontMap = Enumerator() gi = PyGrammarInfo(grammar, nontMap) sm = PyStorageManager() la = build_PyLatentAnnotation_initial(grammar, gi, sm) parser = Coarse_to_fine_parser(grammar, la, gi, nontMap, base_parser_type=GFParser_k_best) parser.set_input(inp) parser.parse() self.assertTrue(parser.recognized()) der = parser.max_rule_product_derivation() print(der) der = parser.best_derivation_tree() print(der) for node in der.ids(): print(der.getRule(node), der.spanned_ranges(node))
def __str__(self):
    """Render this item as '[action:rule:{..span groups..}]'.

    The span part lists one brace-group per variable set: first the LHS
    components (member -1, up to complete_to()), then one group per RHS
    member (up to max_arg(mem)), each component shown via self.range().
    """
    def fmt_group(variables):
        # one '{r1,r2,...}' group for a list of LCFRS variables
        return '{' + ','.join(str(self.range(v)) for v in variables) + '}'

    groups = [fmt_group([LCFRS_var(-1, component)
                         for component in range(self.complete_to() + 1)])]
    for member in range(self.max_mem() + 1):
        groups.append(fmt_group([LCFRS_var(member, component)
                                 for component in range(self.max_arg(member) + 1)]))
    span_repr = '{' + ','.join(groups) + '}'
    return '[' + self.action_id() + ':' + str(self._rule) + ':' + span_repr + ']'
def induce_grammar_from(dsg, rec_par, decomp, labeling=(lambda x, y: str(x)), terminal_labeling=id,
                        terminal_labeling_lcfrs=None, start="START", normalize=True,
                        enforce_outputs=True):
    """Induce an LCFRS (with DOG/sync components on the rules) from a dsg.

    :param dsg: a DeepSyntaxGraph-like object (must provide .dog) — TODO confirm type
    :param rec_par: recursive partitioning of the sentence positions
    :param decomp: decomposition; decomp[0] covers the whole graph
    :param labeling: nonterminal labeling function
    :param terminal_labeling: terminal labeling for the graph part
    :param terminal_labeling_lcfrs: terminal labeling for the string (LCFRS) part;
        defaults to *terminal_labeling* when None
    :param start: start nonterminal of the induced grammar
    :return: the induced LCFRS
    """
    if terminal_labeling_lcfrs is None:
        terminal_labeling_lcfrs = terminal_labeling
    lcfrs = LCFRS(start=start)
    ordered_nodes = dsg.dog.ordered_nodes()
    rhs_nont = induce_grammar_rec(lcfrs, dsg, rec_par, decomp, labeling, terminal_labeling,
                                  terminal_labeling_lcfrs, normalize, enforce_outputs,
                                  ordered_nodes=ordered_nodes)
    rhs_top = dsg.dog.top(decomp[0])

    # construct a chain rule from START to initial nonterminal of decomposition
    # LCFRS part
    lcfrs_lhs = LCFRS_lhs(start)
    lcfrs_lhs.add_arg([LCFRS_var(0, 0)])

    # DOG part: one node per top element, wired to a single nonterminal edge
    dog = DirectedOrderedGraph()
    assert len(dsg.dog.inputs) == 0
    assert not enforce_outputs or len(dsg.dog.outputs) > 0
    for i in range(len(rhs_top)):
        dog.add_node(i)
    for output in dsg.dog.outputs:
        dog.add_to_outputs(rhs_top.index(output))
    dog.add_nonterminal_edge([], [i for i in range(len(rhs_top))], enforce_outputs)

    # no sync
    sync = []

    lcfrs.add_rule(lcfrs_lhs, [rhs_nont], weight=1.0, dcp=[dog, sync])
    return lcfrs
def build_grammar(self):
    """Build a tiny LCFRS: S -> S S, S -> a, plus an A -> a rule.

    :return: triple of (grammar, index of the binary S rule, index of the
        terminal S rule)
    """
    grammar = LCFRS("S")

    binary_lhs = LCFRS_lhs("S")
    binary_lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
    binary_rule = grammar.add_rule(binary_lhs, ["S", "S"])

    terminal_lhs = LCFRS_lhs("S")
    terminal_lhs.add_arg(["a"])
    terminal_rule = grammar.add_rule(terminal_lhs, [])

    # extra A -> a rule; its index is not returned
    aux_lhs = LCFRS_lhs("A")
    aux_lhs.add_arg(["a"])
    grammar.add_rule(aux_lhs, [])

    return grammar, binary_rule.get_idx(), terminal_rule.get_idx()
def build_paper_grammar():
    """Construct the small example LCFRS with rules B -> a, S -> B, B -> B B,
    made proper before returning.

    :return: the proper LCFRS with start symbol S
    """
    grammar = LCFRS("S")
    # rule 0: terminal production B -> a
    terminal_lhs = LCFRS_lhs("B")
    terminal_lhs.add_arg(["a"])
    grammar.add_rule(terminal_lhs, [])
    # rule 1: chain rule S -> B
    chain_lhs = LCFRS_lhs("S")
    chain_lhs.add_arg([LCFRS_var(0, 0)])
    grammar.add_rule(chain_lhs, ["B"])
    # rule 2: binary concatenation B -> B B
    binary_lhs = LCFRS_lhs("B")
    binary_lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
    grammar.add_rule(binary_lhs, ["B", "B"])
    grammar.make_proper()
    return grammar
def __spanned_input_by(self, id):
    """Collect every input position covered by derivation item *id*,
    concatenating the half-open ranges of all its fan-out components.

    :param id: key into self.__derivationItems
    :return: list of int input positions in component order
    """
    item = self.__derivationItems[id]
    assert isinstance(item, DerivationItem)
    positions = []
    for component in range(item.fanout()):
        span = item.range(LCFRS_var(-1, component))
        assert isinstance(span, Range)
        # span is half-open: [left, right)
        positions.extend(range(span.left, span.right))
    return positions
def build_grammar():
    """Construct a proper LCFRS over {a, b} with an ambiguous binary S rule
    (one variant carrying dcp ["1.5"]), two S -> b variants (one weighted,
    one with dcp ["4"]), and an unreachable A -> a rule.

    :return: the proper LCFRS with start symbol START
    """
    grammar = LCFRS("START")

    def add(nont, argument, rhs, **kwargs):
        # helper: build an lhs with a single argument and add the rule
        lhs = LCFRS_lhs(nont)
        lhs.add_arg(argument)
        grammar.add_rule(lhs, rhs, **kwargs)

    # rule 0
    add("START", [LCFRS_var(0, 0)], ["S"])
    # rule 1
    add("S", [LCFRS_var(0, 0), LCFRS_var(1, 0)], ["S", "S"])
    # rule 1.5
    add("S", [LCFRS_var(0, 0), LCFRS_var(1, 0)], ["S", "S"], dcp=["1.5"])
    # rule 2
    add("S", ["a"], [])
    # rule 3
    add("S", ["b"], [], weight=2.0)
    # rule 4
    add("S", ["b"], [], dcp=["4"])
    # rule 5
    add("A", ["a"], [])

    grammar.make_proper()
    return grammar
def fill_lcfrs_lhs(lhs, sent_positions, children, sentence, terminal_labeling):
    """
    Fill the argument(s) of an LCFRS_lhs: walk the spans covered by
    *sent_positions* and, at each position, either reference the child
    nonterminal whose span starts there (as an LCFRS_var) or emit a terminal
    for the token at that position.

    :rtype: list[int]
    :param lhs: the LCFRS_lhs to fill (mutated via lhs.add_arg)
    :param sent_positions: sentence positions covered by this rule's lhs
    :param children: list of position lists, one per RHS child
    :param sentence: the input sentence (indexable by position)
    :param terminal_labeling: maps a sentence token to its terminal label
    :return: list of sentence positions generated as terminals by this rule,
        in ascending order
    """
    spans = join_spans(sent_positions)
    children_spans = list(map(join_spans, children))
    generated_sentence_positions = []
    for (low, high) in spans:
        arg = []
        i = low
        while i <= high:
            mem = 0
            match = False
            # look for a child span starting exactly at position i
            while mem < len(children_spans) and not match:
                child_spans = children_spans[mem]
                mem_arg = 0
                while mem_arg < len(child_spans) and not match:
                    child_span = child_spans[mem_arg]
                    if child_span[0] == i:
                        # child mem's mem_arg-th component covers [i, child_span[1]]
                        arg.append(LCFRS_var(mem, mem_arg))
                        i = child_span[1] + 1
                        match = True
                    mem_arg += 1
                mem += 1
            # Add terminal
            if not match:
                arg.append(terminal_labeling(sentence[i]))
                generated_sentence_positions.append(i)
                i += 1
                # raise Exception('Expected ingredient for LCFRS argument was not found.')
        lhs.add_arg(arg)
    return generated_sentence_positions
def direct_extract_lcfrs_from_prebinarized_corpus(tree,
                                                  term_labeling=None,
                                                  nont_labeling=None,
                                                  isolate_pos=True):
    """Directly extract an LCFRS/DCP hybrid grammar from a prebinarized tree.

    Handles the degenerate single-leaf tree with a terminal START rule;
    otherwise recurses via direct_extract_lcfrs_prebinarized_recur and adds a
    START chain rule to the root nonterminal.

    :param tree: a HybridDag-like tree with root, full_yield, node_token
    :param term_labeling: terminal labeling strategy; defaults to a fresh
        PosTerminals() per call
    :param nont_labeling: nonterminal labeling strategy; defaults to a fresh
        BasicNonterminalLabeling() per call
    :param isolate_pos: whether POS leaves get their own nonterminals
    :return: the extracted LCFRS
    """
    # Fix: the original used `term_labeling=PosTerminals()` etc. as defaults,
    # which instantiates the labelers once at definition time and shares them
    # across every call (mutable-default pitfall). Use None sentinels instead.
    if term_labeling is None:
        term_labeling = PosTerminals()
    if nont_labeling is None:
        nont_labeling = BasicNonterminalLabeling()

    gram = LCFRS(start=START)
    root = tree.root[0]
    if root in tree.full_yield():
        # degenerate case: the root itself is a leaf -> terminal START rule
        lhs = LCFRS_lhs(START)
        label = term_labeling.token_label(tree.node_token(root))
        lhs.add_arg([label])
        dcp_rule = DCP_rule(DCP_var(-1, 0),
                            [DCP_term(DCP_index(0, edge_label=tree.node_token(root).edge()), [])])
        gram.add_rule(lhs, [], dcp=[dcp_rule])
    else:
        first, _, _ = direct_extract_lcfrs_prebinarized_recur(tree, root, gram, term_labeling,
                                                              nont_labeling, isolate_pos)
        # chain rule START -> <root nonterminal>
        lhs = LCFRS_lhs(START)
        lhs.add_arg([LCFRS_var(0, 0)])
        dcp_rule = DCP_rule(DCP_var(-1, 0), [DCP_var(0, 0)])
        gram.add_rule(lhs, [first], dcp=[dcp_rule])
    return gram
def create_lcfrs_lhs(tree, node_ids, t_max, b_max, children, nont_labelling):
    """
    Create the LCFRS_lhs of some LCFRS-DCP hybrid rule.
    :rtype: LCFRS_lhs
    :param tree: HybridTree
    :param node_ids: list of string (node in an recursive partitioning)
    :param t_max: top_max of node_ids
    :param b_max: bottom_max of node ids
    :param children: list of pairs of list of list of string
    #  (pairs of top_max / bottom_max of child nodes in recursive partitioning)
    :type nont_labelling: AbstractLabeling
    :return: LCFRS_lhs
    :raise Exception: when a position is covered neither by a child span
        nor matched as a child-span start (malformed partitioning)
    """
    positions = map(tree.node_index, node_ids)
    spans = join_spans(positions)
    children_spans = list(map(join_spans, [map(tree.node_index, ids) for (ids, _) in children]))
    # fanout of the nonterminal equals the number of disjoint spans
    lhs = LCFRS_lhs(nont_labelling.label_nonterminal(tree, node_ids, t_max, b_max, len(spans)))
    for (low, high) in spans:
        arg = []
        i = low
        while i <= high:
            mem = 0
            match = False
            # find the child component whose span starts at position i
            while mem < len(children_spans) and not match:
                child_spans = children_spans[mem]
                mem_arg = 0
                while mem_arg < len(child_spans) and not match:
                    child_span = child_spans[mem_arg]
                    if child_span[0] == i:
                        arg.append(LCFRS_var(mem, mem_arg))
                        # skip past the matched child component
                        i = child_span[1] + 1
                        match = True
                    mem_arg += 1
                mem += 1
            # Sanity check
            if not match:
                raise Exception('Expected ingredient for LCFRS argument was not found.')
        lhs.add_arg(arg)
    return lhs
def induce_grammar(trees, nont_labelling, term_labelling, recursive_partitioning, start_nont='START'):
    """
    :rtype: LCFRS
    :param trees: corpus of HybridTree (i.e. list (or Generator for lazy IO))
    :type trees: __generator[HybridTree]
    :type nont_labelling: AbstractLabeling
    :param term_labelling: HybridTree, NodeId -> str
    :param recursive_partitioning: HybridTree -> RecursivePartitioning
    :type start_nont: str
    :rtype: int, LCFRS
    Top level method to induce an LCFRS/DCP-hybrid grammar for dependency parsing.
    """
    grammar = LCFRS(start_nont)
    n_trees = 0
    for tree in trees:
        n_trees += 1
        for rec_par in recursive_partitioning:
            # partitioning strategies named *no_new_nont* additionally receive
            # the current nonterminal set and the labeling strategy
            match = re.search(r'no_new_nont', rec_par.__name__)
            if match:
                rec_par_int = rec_par(tree, grammar.nonts(), nont_labelling)
            else:
                rec_par_int = rec_par(tree)

            rec_par_nodes = tree.node_id_rec_par(rec_par_int)

            (_, _, nont_name) = add_rules_to_grammar_rec(tree, rec_par_nodes, grammar,
                                                         nont_labelling, term_labelling)

            # Add rule from top start symbol to top most nonterminal for the hybrid tree
            lhs = LCFRS_lhs(start_nont)
            lhs.add_arg([LCFRS_var(0, 0)])
            rhs = [nont_name]
            dcp_rule = DCP_rule(DCP_var(-1, 0), [DCP_var(0, 0)])
            grammar.add_rule(lhs, rhs, 1.0, [dcp_rule])

    grammar.make_proper()
    return n_trees, grammar
def direct_extract_lcfrs_prebinarized_recur(tree, idx, gram, term_labeling, nont_labeling, isolate_pos):
    """Recursively extract LCFRS/DCP rules for the subtree rooted at *idx* of a
    prebinarized HybridDag, adding them to *gram*.

    The nonterminal name encodes fanout and the sizes of the bottom/top
    interface node lists of the sub-DAG.

    :param tree: the HybridDag being extracted from
    :param idx: current node id
    :param gram: the LCFRS under construction (mutated)
    :param term_labeling: terminal labeling strategy
    :param nont_labeling: nonterminal labeling strategy
    :param isolate_pos: whether leaves (POS) get their own nonterminals
    :return: triple (nonterminal name, bottom node list, top node list)
    :raise ValueError: if a node has more than two children
    """
    assert isinstance(tree, HybridDag)
    fringe = tree.fringe(idx)
    spans = join_spans(fringe)
    nont_fanout = len(spans)
    # bottom/top interface nodes of the sub-DAG below (and including) idx
    _bot = list(bottom(tree, [idx] + tree.descendants(idx)))
    _top = list(top(tree, [idx] + tree.descendants(idx)))
    nont = nont_labeling.label_nont(tree, idx) + '/' + '/'.join(
        map(str, [nont_fanout, len(_bot), len(_top)]))
    lhs = LCFRS_lhs(nont)

    if idx in tree.full_yield():
        # leaf: terminal rule
        label = term_labeling.token_label(tree.node_token(idx))
        lhs.add_arg([label])
        dcp_rule = DCP_rule(DCP_var(-1, 0), [
            DCP_term(DCP_index(0, edge_label=tree.node_token(idx).edge()), [])
        ])
        gram.add_rule(lhs, [], dcp=[dcp_rule])
        return lhs.nont(), _bot, _top

    if not len(tree.children(idx)) <= 2:
        raise ValueError("Tree is not prebinarized!", tree, idx)

    children = [(child, join_spans(tree.fringe(child))) for child in tree.children(idx)]
    edge_labels = []
    # build the LHS arguments: at each position either reference a child
    # component (LCFRS_var) or inline the leaf terminal (when not isolate_pos)
    for (low, high) in spans:
        arg = []
        pos = low
        while pos <= high:
            child_num = 0
            for i, (child, child_spans) in enumerate(children):
                for j, (child_low, child_high) in enumerate(child_spans):
                    if pos == child_low:
                        if child in tree.full_yield() and not isolate_pos:
                            arg += [term_labeling.token_label(tree.node_token(child))]
                            edge_labels += [tree.node_token(child).edge()]
                        else:
                            arg += [LCFRS_var(child_num, j)]
                        pos = child_high + 1
                # only children that become nonterminals advance the member index
                if child not in tree.full_yield() or isolate_pos:
                    child_num += 1
        lhs.add_arg(arg)

    dcp_term_args = []
    rhs = []
    nont_counter = 0
    term_counter = 0
    cbots = []  # bottom lists of nonterminal children
    ctops = []  # top lists of nonterminal children
    for (child, child_spans) in children:
        if child not in tree.full_yield() or isolate_pos:
            c_nont, _cbot, _ctop = direct_extract_lcfrs_prebinarized_recur(
                tree, child, gram, term_labeling, nont_labeling, isolate_pos)
            rhs.append(c_nont)
            cbots.append(_cbot)
            ctops.append(_ctop)
            dcp_term_args.append(
                DCP_var(nont_counter, len(_cbot) + _ctop.index(child)))
            nont_counter += 1
        else:
            # inlined leaf: reference its terminal index
            dcp_term_args.append(
                DCP_term(
                    DCP_index(term_counter,
                              edge_label=edge_labels[term_counter]), []))
            term_counter += 1

    # secondary (DAG) edges: only external targets (reachable via _bot) are
    # supported; internal secondary children are asserted away
    for sec, sec_child in enumerate(tree.sec_children(idx)):
        if sec_child not in tree.descendants(idx):
            print(idx, "has external", sec_child)
            assert sec_child in _bot
            dcp_term_args.append(
                DCP_term(DCP_string("SECEDGE"),
                         [DCP_var(-1, _bot.index(sec_child))]))
        else:
            print(idx, "has internal", sec_child)
            assert False

    dcp_lhs = DCP_var(-1, len(_bot) + _top.index(idx))
    label = tree.node_token(idx).category()
    # binarization artifacts (category matching .*|<...>) are transparent:
    # their children are passed through without an own DCP node
    if re.match(r'.*\|<.*>', label):
        dcp_term = dcp_term_args
    else:
        dcp_term = [
            DCP_term(DCP_string(label, edge_label=tree.node_token(idx).edge()),
                     dcp_term_args)
        ]
    dcp_rule = DCP_rule(dcp_lhs, dcp_term)
    dcp_rules = [dcp_rule]

    # wire through every further top node: it must come from some child's top
    for top_idx in _top:
        if top_idx != idx:
            # must be in some child
            rule = None
            for nont_counter, _ctop in enumerate(ctops):
                if top_idx in _ctop:
                    rule = DCP_rule(
                        DCP_var(-1, len(_bot) + _top.index(top_idx)),
                        [
                            DCP_var(
                                nont_counter,
                                len(cbots[nont_counter]) + _ctop.index(top_idx))
                        ])
                    break
            assert rule is not None
            dcp_rules.append(rule)

    # satisfy every child's bottom requirement: either from our own bottom
    # interface or from a sibling's top
    for nont_counter, _cbot in enumerate(cbots):
        for bot_idx in _cbot:
            rule = None
            rule_lhs = DCP_var(nont_counter, _cbot.index(bot_idx))
            if bot_idx in _bot:
                rule = DCP_rule(rule_lhs, [DCP_var(-1, _bot.index(bot_idx))])
            else:
                for nont_counter2, _ctop in enumerate(ctops):
                    if bot_idx in _ctop:
                        rule = DCP_rule(rule_lhs, [
                            DCP_var(
                                nont_counter2,
                                len(cbots[nont_counter2]) + _ctop.index(bot_idx))
                        ])
                        break
            assert rule is not None
            dcp_rules.append(rule)

    gram.add_rule(lhs, rhs, dcp=dcp_rules)
    return nont, _bot, _top
def build_nm_grammar(): grammar = LCFRS("START") # rule 0 lhs = LCFRS_lhs("START") lhs.add_arg([LCFRS_var(0, 0)]) grammar.add_rule(lhs, ["S"]) # rule 1 lhs = LCFRS_lhs("S") lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0), LCFRS_var(0, 1), LCFRS_var(1, 1)]) grammar.add_rule(lhs, ["N", "M"]) for nont, term in [("A", "a"), ("B", "b"), ("C", "c"), ("D", "d")]: # rule 2 lhs = LCFRS_lhs(nont) lhs.add_arg([term]) grammar.add_rule(lhs, []) for nont, nont_, c1, c2 in [("N", "N'", "A", "C"), ("M", "M'", "B", "D")]: # rule 3 lhs = LCFRS_lhs(nont) lhs.add_arg([LCFRS_var(0, 0)]) lhs.add_arg([LCFRS_var(1, 0)]) grammar.add_rule(lhs, [c1, c2]) # rule 4 lhs = LCFRS_lhs(nont) lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)]) lhs.add_arg([LCFRS_var(0,1)]) grammar.add_rule(lhs, [nont_, c1]) # rule 5 lhs = LCFRS_lhs(nont_) lhs.add_arg([LCFRS_var(0, 0)]) lhs.add_arg([LCFRS_var(0, 1), LCFRS_var(1, 0)]) grammar.add_rule(lhs, [nont, c2]) grammar.make_proper() return grammar