def create_leaf_DCP_rule(bottom_max, dependency_label):
    """
    Build the DCP rule attached to a leaf of the recursive partitioning.

    The rule consists of a single equation whose right-hand side is one
    DCP_term headed by ``DCP_index(0, dependency_label)``.

    * If ``bottom_max`` is non-empty, the equation defines ``DCP_var(-1, 1)``
      and the term receives the single argument ``DCP_var(-1, 0)``
      (the inherited argument of the LHS).
    * If ``bottom_max`` is empty, the equation defines ``DCP_var(-1, 0)``
      and the term has no arguments.

    :param bottom_max: list of list of string; only its truthiness is used
    :param dependency_label: dependency label linked to the corresponding terminal
    :return: DCP_rule
    :rtype: DCP_rule
    """
    has_inherited = bool(bottom_max)

    # The synthesized argument comes after the inherited one (if any).
    synthesized = DCP_var(-1, 1 if has_inherited else 0)
    head = DCP_index(0, dependency_label)
    inherited_args = [DCP_var(-1, 0)] if has_inherited else []

    return DCP_rule(synthesized, [DCP_term(head, inherited_args)])
def create_dcp_rule(mem, arg, top_max, bottom_max, children):
    """
    Create the DCP equation for one synthesized attribute of the LHS nont
    or one inherited attribute of a RHS nont of an LCFRS-sDCP-hybrid rule.

    The sequence of ids that the attribute has to produce is consumed from
    left to right. For each leading id, the first sequence starting with
    that id is looked up among the available sources (the inherited
    attributes of the LHS when ``mem >= 0``, followed by the synthesized
    attributes of every RHS nont), and the matching variable is appended
    to the right-hand side of the equation.

    :param mem: int (member part of attribute: -1 for LHS, >=0 for RHS)
    :param arg: int (argument part of attribute: >= 0)
    :param top_max: list of list of string (top_max of nont on LHS)
    :param bottom_max: list of list of string (bottom_max of nont on LHS)
    :param children: list of pairs of list of list of string
                     (pair of (top_max, bottom_max) for every nont on RHS)
    :return: DCP_rule
    :rtype: DCP_rule
    :raise Exception: if a matched sequence is not a prefix of the remaining
                      ids, or if no source provides the next id
    """
    lhs = DCP_var(mem, arg)
    rhs = []

    # Copy the target sequence, since it is consumed below.
    if mem < 0:
        pending = top_max[arg - len(bottom_max)][:]
    else:
        pending = children[mem][1][arg][:]

    # Source index -1 stands for the inherited arguments of the LHS; it is
    # only consulted when an inherited argument of a RHS nont is computed.
    first_source = -1 if mem >= 0 else 0

    while pending:
        head = pending[0]
        found = False
        for source in range(first_source, len(children)):
            if source >= 0:
                provider = children[source]
            else:
                # The empty second component makes the offset computation
                # below yield plain inherited-argument indices.
                provider = (bottom_max, [])

            for offset, seq in enumerate(provider[0]):
                if head == seq[0]:
                    # Sanity check: seq has to be a prefix of what is left.
                    if pending[:len(seq)] != seq:
                        raise Exception
                    # Synthesized attributes of a RHS nont are numbered after
                    # its inherited ones; for the LHS source the shift is 0.
                    rhs.append(DCP_var(source, len(provider[1]) + offset))
                    pending = pending[len(seq):]
                    found = True
                    break
            if found:
                break

        # Sanity check, that the leading id was matched by some source.
        if not found:
            raise Exception('Expected ingredient for synthesised or inherited argument was not found.')

    return DCP_rule(lhs, rhs)
def direct_extract_lcfrs_from_prebinarized_corpus(tree,
                                                  term_labeling=PosTerminals(),
                                                  nont_labeling=BasicNonterminalLabeling(),
                                                  isolate_pos=True):
    """
    Extract an LCFRS (with DCP rules) from a single prebinarized tree.

    A fresh grammar with start symbol ``START`` is created. If the first
    root of the tree is part of the tree's full yield, a single terminal
    rule for ``START`` is added. Otherwise the rules are extracted
    recursively from the root, and a rule from ``START`` to the nonterminal
    obtained for the root is added.

    :param tree: the tree to extract rules from
    :param term_labeling: provides ``token_label`` for terminal tokens
    :param nont_labeling: passed on to the recursive extraction
    :param isolate_pos: passed on to the recursive extraction
    :return: the extracted grammar
    :rtype: LCFRS
    """
    grammar = LCFRS(start=START)
    root_id = tree.root[0]

    if root_id not in tree.full_yield():
        # Regular case: extract rules below the root, then link START to it.
        top_nont, _, _ = direct_extract_lcfrs_prebinarized_recur(
            tree, root_id, grammar, term_labeling, nont_labeling, isolate_pos)
        start_lhs = LCFRS_lhs(START)
        start_lhs.add_arg([LCFRS_var(0, 0)])
        copy_rule = DCP_rule(DCP_var(-1, 0), [DCP_var(0, 0)])
        grammar.add_rule(start_lhs, [top_nont], dcp=[copy_rule])
        return grammar

    # Degenerate case: the root itself is a terminal.
    start_lhs = LCFRS_lhs(START)
    start_lhs.add_arg([term_labeling.token_label(tree.node_token(root_id))])
    synthesized = DCP_var(-1, 0)
    terminal_term = DCP_term(
        DCP_index(0, edge_label=tree.node_token(root_id).edge()), [])
    grammar.add_rule(start_lhs, [], dcp=[DCP_rule(synthesized, [terminal_term])])
    return grammar
def induce_grammar(trees, nont_labelling, term_labelling, recursive_partitioning, start_nont='START'):
    """
    Top level method to induce an LCFRS/DCP-hybrid grammar for dependency parsing.

    For every tree and every recursive partitioning strategy, rules are
    added to the grammar, followed by a rule from the start symbol to the
    nonterminal returned for that tree. Strategies whose ``__name__``
    contains ``no_new_nont`` are additionally given the grammar's current
    nonterminals and the nonterminal labelling. Finally ``make_proper`` is
    called on the grammar.

    :param trees: corpus of HybridTree (i.e. list (or Generator for lazy IO))
    :type trees: __generator[HybridTree]
    :type nont_labelling: AbstractLabeling
    :param term_labelling: HybridTree, NodeId -> str
    :param recursive_partitioning: iterable of callables, HybridTree -> RecursivePartitioning
    :type start_nont: str
    :return: number of trees processed and the induced grammar
    :rtype: int, LCFRS
    """
    grammar = LCFRS(start_nont)

    n_trees = 0
    for n_trees, tree in enumerate(trees, 1):
        for strategy in recursive_partitioning:
            needs_grammar = re.search(r'no_new_nont', strategy.__name__)
            partitioning = (
                strategy(tree, grammar.nonts(), nont_labelling)
                if needs_grammar
                else strategy(tree)
            )
            partitioning_nodes = tree.node_id_rec_par(partitioning)
            _, _, top_nont = add_rules_to_grammar_rec(
                tree, partitioning_nodes, grammar, nont_labelling, term_labelling)

            # Add rule from top start symbol to top most nonterminal for the hybrid tree
            start_lhs = LCFRS_lhs(start_nont)
            start_lhs.add_arg([LCFRS_var(0, 0)])
            copy_rule = DCP_rule(DCP_var(-1, 0), [DCP_var(0, 0)])
            grammar.add_rule(start_lhs, [top_nont], 1.0, [copy_rule])

    grammar.make_proper()
    return n_trees, grammar
def direct_extract_lcfrs_prebinarized_recur(tree, idx, gram, term_labeling, nont_labeling, isolate_pos):
    """
    Recursively extract LCFRS/DCP rules for the node ``idx`` of a
    prebinarized ``HybridDag`` and add them to ``gram``.

    The nonterminal for ``idx`` is named
    ``<label>/<fanout>/<len(bottom)>/<len(top)>``, where the label comes
    from ``nont_labeling``, the fanout is the number of joined spans of the
    node's fringe, and bottom/top are the node lists computed by the
    ``bottom`` and ``top`` helpers for ``idx`` and its descendants.

    :param tree: the dag to extract from (asserted to be a HybridDag)
    :param idx: id of the node currently processed
    :param gram: grammar to which the rules are added
    :param term_labeling: provides ``token_label`` for terminal tokens
    :param nont_labeling: provides ``label_nont`` for nodes
    :param isolate_pos: if true, children in the full yield get their own
                        nonterminal instead of being inlined as terminals
    :return: triple (nonterminal name, bottom node list, top node list)
    :raise ValueError: if ``idx`` has more than two children
    """
    assert isinstance(tree, HybridDag)

    spans = join_spans(tree.fringe(idx))
    bot_nodes = list(bottom(tree, [idx] + tree.descendants(idx)))
    top_nodes = list(top(tree, [idx] + tree.descendants(idx)))

    base_label = nont_labeling.label_nont(tree, idx)
    arity_suffix = '/'.join(
        str(n) for n in (len(spans), len(bot_nodes), len(top_nodes)))
    nont = base_label + '/' + arity_suffix
    lhs = LCFRS_lhs(nont)

    # Terminal node: a single rule with empty RHS.
    if idx in tree.full_yield():
        lhs.add_arg([term_labeling.token_label(tree.node_token(idx))])
        leaf_lhs = DCP_var(-1, 0)
        leaf_term = DCP_term(
            DCP_index(0, edge_label=tree.node_token(idx).edge()), [])
        gram.add_rule(lhs, [], dcp=[DCP_rule(leaf_lhs, [leaf_term])])
        return lhs.nont(), bot_nodes, top_nodes

    if len(tree.children(idx)) > 2:
        raise ValueError("Tree is not prebinarized!", tree, idx)

    def inlined_as_terminal(node):
        # A child is written directly into the LHS argument (instead of
        # getting a nonterminal of its own) iff it is in the full yield
        # and POS isolation is switched off.
        return node in tree.full_yield() and not isolate_pos

    child_info = [(child, join_spans(tree.fringe(child)))
                  for child in tree.children(idx)]

    # Compose each LHS argument by walking its span from left to right and
    # picking whichever child span starts at the current position.
    terminal_edges = []
    for low, high in spans:
        component = []
        pos = low
        while pos <= high:
            var_member = 0
            for child, child_spans in child_info:
                for j, (child_low, child_high) in enumerate(child_spans):
                    if pos == child_low:
                        if inlined_as_terminal(child):
                            component.append(
                                term_labeling.token_label(tree.node_token(child)))
                            terminal_edges.append(tree.node_token(child).edge())
                        else:
                            component.append(LCFRS_var(var_member, j))
                        pos = child_high + 1
                # Only children that become RHS nonterminals are numbered.
                if not inlined_as_terminal(child):
                    var_member += 1
        lhs.add_arg(component)

    # Recurse into the children and collect the arguments of the DCP term
    # that is built for this node.
    term_args = []
    rhs = []
    child_bots = []
    child_tops = []
    terminals_seen = 0
    for child, _ in child_info:
        if inlined_as_terminal(child):
            term_args.append(
                DCP_term(
                    DCP_index(terminals_seen,
                              edge_label=terminal_edges[terminals_seen]),
                    []))
            terminals_seen += 1
        else:
            child_nont, child_bot, child_top = direct_extract_lcfrs_prebinarized_recur(
                tree, child, gram, term_labeling, nont_labeling, isolate_pos)
            member = len(rhs)
            rhs.append(child_nont)
            child_bots.append(child_bot)
            child_tops.append(child_top)
            term_args.append(
                DCP_var(member, len(child_bot) + child_top.index(child)))

    # Secondary children outside the subtree are referenced via the
    # corresponding inherited argument; internal ones are not supported.
    for sec_child in tree.sec_children(idx):
        if sec_child in tree.descendants(idx):
            print(idx, "has internal", sec_child)
            assert False
        else:
            print(idx, "has external", sec_child)
            assert sec_child in bot_nodes
            term_args.append(
                DCP_term(DCP_string("SECEDGE"),
                         [DCP_var(-1, bot_nodes.index(sec_child))]))

    own_lhs = DCP_var(-1, len(bot_nodes) + top_nodes.index(idx))
    category = tree.node_token(idx).category()
    if re.match(r'.*\|<.*>', category):
        # Category matches the pattern: pass the collected arguments on
        # without wrapping them in a term for this node.
        own_rhs = term_args
    else:
        own_rhs = [
            DCP_term(
                DCP_string(category, edge_label=tree.node_token(idx).edge()),
                term_args)
        ]
    dcp_rules = [DCP_rule(own_lhs, own_rhs)]

    # Every other top node of this nonterminal is copied from the first
    # child whose top list contains it.
    for top_idx in top_nodes:
        if top_idx != idx:
            rule = None
            for member, (child_bot, child_top) in enumerate(zip(child_bots, child_tops)):
                if top_idx in child_top:
                    rule = DCP_rule(
                        DCP_var(-1, len(bot_nodes) + top_nodes.index(top_idx)),
                        [DCP_var(member, len(child_bot) + child_top.index(top_idx))])
                    break
            assert rule is not None
            dcp_rules.append(rule)

    # Every bottom node of a child is fed either from this nonterminal's
    # own bottom list or from the top list of some child.
    for member, child_bot in enumerate(child_bots):
        for bot_idx in child_bot:
            rule = None
            target = DCP_var(member, child_bot.index(bot_idx))
            if bot_idx in bot_nodes:
                rule = DCP_rule(target, [DCP_var(-1, bot_nodes.index(bot_idx))])
            else:
                for provider, (other_bot, other_top) in enumerate(zip(child_bots, child_tops)):
                    if bot_idx in other_top:
                        rule = DCP_rule(
                            target,
                            [DCP_var(provider, len(other_bot) + other_top.index(bot_idx))])
                        break
            assert rule is not None
            dcp_rules.append(rule)

    gram.add_rule(lhs, rhs, dcp=dcp_rules)
    return nont, bot_nodes, top_nodes