Esempio n. 1
0
def create_leaf_DCP_rule(bottom_max, dependency_label):
    """
    Build the DCP rule that belongs to a leaf of the recursive partitioning.

    The LCFRS rule this DCP rule is linked to has an empty RHS, hence every
    variable created here refers to the LHS nonterminal (member index -1).

    Two shapes are produced, depending on bottom_max:

    * bottom_max is non-empty (the corresponding hybrid tree node is not a
      leaf): the rule defines LHS argument 1 as the term
      "[0:{dependency_label}]( <-1,0> )", i.e. LHS argument 0 is passed
      down as the only child of the term.
    * bottom_max is empty: the rule defines LHS argument 0 as the term
      "[0:{dependency_label}]( )" without children.

    :param bottom_max: list of list of string
    :param dependency_label: dependency label linked to the corresponding terminal
    :rtype: DCP_rule
    :return: DCP_rule
    """
    has_bottom = bool(bottom_max)

    # With a non-empty bottom_max, argument 0 is consumed inside the term and
    # the term itself is stored in argument 1; otherwise argument 0 is defined.
    lhs = DCP_var(-1, 1 if has_bottom else 0)
    head = DCP_index(0, dependency_label)
    children_of_head = [DCP_var(-1, 0)] if has_bottom else []

    return DCP_rule(lhs, [DCP_term(head, children_of_head)])
Esempio n. 2
0
def create_dcp_rule(mem, arg, top_max, bottom_max, children):
    """
    Create DCP equation for some synthesized attributes of LHS nont
    or inherited attributes of RHS nont of an LCFRS-sDCP-hybrid rule.

    The attribute <mem, arg> is defined by a sequence of node ids
    (top_max[arg - len(bottom_max)] for the LHS, children[mem][1][arg] for
    an RHS nont). That sequence is decomposed, left to right, into the
    id sequences offered by the candidates:

    * the first component (top_max) of every pair in children, and
    * only when mem >= 0: additionally bottom_max of the LHS, which is
      tried first and addressed with member index -1.

    Every matched candidate sequence contributes one DCP_var to the RHS of
    the resulting equation.

    :rtype: DCP_rule
    :param mem: int                            (member part of attribute: -1 for LHS, >=0 for RHS)
    :param arg: int                            (argument part of attribute: >= 0)
    :param top_max:    list of list of string  (top_max of nont on LHS)
    :param bottom_max: list of list of string  (bottom_max of nont on LHS)
    :param children: list of pairs of list of list of string
                                       (pair of (top_max, bottom_max) for every nont on RHS)
    :return: DCP_rule
    :raise Exception: if a candidate sequence starts with the next expected id
                      but is not a prefix of the remaining ids, or if no
                      candidate sequence starts with the next expected id.
    """
    # Work on a copy: the sequence is consumed from the front below and the
    # caller's lists must stay intact.
    if mem < 0:
        conseq_ids = top_max[arg - len(bottom_max)][:]
    else:
        conseq_ids = children[mem][1][arg][:]

    # Candidate index -1 stands for the LHS. It is only offered when an
    # equation for an attribute of some RHS nont is computed.
    first_candidate = -1 if mem >= 0 else 0

    rhs = []
    while conseq_ids:
        head_id = conseq_ids[0]
        matched = False
        for c_index in range(first_candidate, len(children)):
            if c_index >= 0:
                child = children[c_index]
            else:
                # The second component is deliberately empty, so that
                # len(child[1]) below evaluates to 0 and the variable index
                # is just the position inside bottom_max.
                child = (bottom_max, [])
            for t_seq_index, t_seq in enumerate(child[0]):
                if head_id != t_seq[0]:
                    continue
                # Sanity check: t_seq has to be a prefix of conseq_ids.
                if conseq_ids[:len(t_seq)] != t_seq:
                    raise Exception(
                        'Attribute sequence %r of member %d is not a prefix of the remaining ids %r.'
                        % (t_seq, c_index, conseq_ids))
                # The variable index is offset by the number of entries in
                # the candidate's second component (0 for the LHS pseudo
                # candidate).
                rhs.append(DCP_var(c_index, len(child[1]) + t_seq_index))
                # Drop the matched prefix and continue with the rest.
                conseq_ids = conseq_ids[len(t_seq):]
                matched = True
                break
            if matched:
                break
        # Sanity check, that attribute was matched:
        if not matched:
            raise Exception('Expected ingredient for synthesised or inherited argument was not found.')

    # The LHS variable is only needed for the finished equation.
    return DCP_rule(DCP_var(mem, arg), rhs)
def direct_extract_lcfrs_from_prebinarized_corpus(tree,
                                                  term_labeling=PosTerminals(),
                                                  nont_labeling=BasicNonterminalLabeling(),
                                                  isolate_pos=True):
    """
    Extract an LCFRS/DCP hybrid grammar from a single tree.

    A fresh LCFRS with start symbol START is created and returned. The root
    considered is tree.root[0].

    * If the root is not part of tree.full_yield(), the rules for the tree are
      added by direct_extract_lcfrs_prebinarized_recur, followed by a rule
      that rewrites START to the nonterminal returned for the root and copies
      that nonterminal's DCP argument 0.
    * Otherwise a single rule with empty RHS is added: START produces the
      terminal label of the root token, and the DCP side is a childless term
      carrying the token's edge label.

    :param tree: tree to extract the grammar from
    :param term_labeling: provides token_label(token) for terminals
    :param nont_labeling: forwarded to the recursive extraction
    :param isolate_pos: forwarded to the recursive extraction
    :return: the extracted LCFRS
    """
    gram = LCFRS(start=START)
    root = tree.root[0]

    if root not in tree.full_yield():
        top_nont, _, _ = direct_extract_lcfrs_prebinarized_recur(
            tree, root, gram, term_labeling, nont_labeling, isolate_pos)
        start_lhs = LCFRS_lhs(START)
        start_lhs.add_arg([LCFRS_var(0, 0)])
        copy_rule = DCP_rule(DCP_var(-1, 0), [DCP_var(0, 0)])
        gram.add_rule(start_lhs, [top_nont], dcp=[copy_rule])
        return gram

    # The tree consists of a single token that is both root and yield.
    start_lhs = LCFRS_lhs(START)
    start_lhs.add_arg([term_labeling.token_label(tree.node_token(root))])
    leaf_term = DCP_term(DCP_index(0, edge_label=tree.node_token(root).edge()), [])
    gram.add_rule(start_lhs, [], dcp=[DCP_rule(DCP_var(-1, 0), [leaf_term])])
    return gram
Esempio n. 4
0
def induce_grammar(trees, nont_labelling, term_labelling, recursive_partitioning, start_nont='START'):
    """
    Top level method to induce an LCFRS/DCP-hybrid grammar for dependency parsing.

    For every tree and every recursive partitioning strategy, the rules for
    the tree are added to one shared grammar, plus a rule from start_nont to
    the top most nonterminal of the tree. Finally grammar.make_proper() is
    called.

    :param trees: corpus of HybridTree (i.e. list (or Generator for lazy IO))
    :type trees: __generator[HybridTree]
    :type nont_labelling: AbstractLabeling
    :param term_labelling: HybridTree, NodeId -> str
    :param recursive_partitioning: iterable of callables, each mapping a
        HybridTree to a recursive partitioning; a callable whose __name__
        contains 'no_new_nont' is additionally passed the current
        nonterminals of the grammar and nont_labelling
    :type start_nont: str
    :rtype: int, LCFRS
    :return: number of trees processed and the induced grammar
    """
    grammar = LCFRS(start_nont)

    n_trees = 0
    for n_trees, tree in enumerate(trees, 1):
        for strategy in recursive_partitioning:
            if 'no_new_nont' in strategy.__name__:
                partitioning = strategy(tree, grammar.nonts(), nont_labelling)
            else:
                partitioning = strategy(tree)

            node_partitioning = tree.node_id_rec_par(partitioning)
            _, _, top_nont = add_rules_to_grammar_rec(
                tree, node_partitioning, grammar, nont_labelling, term_labelling)

            # Connect the start symbol with the top most nonterminal of this
            # hybrid tree; the DCP side just copies argument 0.
            start_lhs = LCFRS_lhs(start_nont)
            start_lhs.add_arg([LCFRS_var(0, 0)])
            copy_rule = DCP_rule(DCP_var(-1, 0), [DCP_var(0, 0)])
            grammar.add_rule(start_lhs, [top_nont], 1.0, [copy_rule])

    grammar.make_proper()
    return n_trees, grammar
Esempio n. 5
0
def direct_extract_lcfrs_prebinarized_recur(tree, idx, gram, term_labeling,
                                            nont_labeling, isolate_pos):
    """
    Add the LCFRS/DCP rules for the subtree rooted at idx to gram (recursively).

    One rule is added for the node idx itself; rules for its children are
    added by recursive calls.

    :param tree: the tree to extract from; must be a HybridDag (asserted)
    :param idx: id of the node whose rule is created
    :param gram: grammar that receives the rules via gram.add_rule
    :param term_labeling: provides token_label(token) for terminals
    :param nont_labeling: provides label_nont(tree, idx) for nonterminal names
    :param isolate_pos: if true, children that are part of the yield get their
                        own nonterminal (and rule) instead of being inlined as
                        terminals into the rule of idx
    :return: triple (nonterminal name of idx, _bot, _top), where _bot/_top are
             the lists computed by bottom(...)/top(...) for idx and its
             descendants
    :raise ValueError: if idx has more than two children
    """
    assert isinstance(tree, HybridDag)
    fringe = tree.fringe(idx)
    spans = join_spans(fringe)
    nont_fanout = len(spans)

    # Node lists over idx and all its descendants. In the DCP variables built
    # below, _bot entries are numbered first and _top entries follow with an
    # offset of len(_bot).
    _bot = list(bottom(tree, [idx] + tree.descendants(idx)))
    _top = list(top(tree, [idx] + tree.descendants(idx)))

    # Nonterminal name: "<label>/<fanout>/<len(_bot)>/<len(_top)>".
    nont = nont_labeling.label_nont(tree, idx) + '/' + '/'.join(
        map(str, [nont_fanout, len(_bot), len(_top)]))

    lhs = LCFRS_lhs(nont)

    # Base case: idx is part of the yield. The rule has an empty RHS, produces
    # the terminal label, and its DCP side is a childless term carrying the
    # token's edge label.
    if idx in tree.full_yield():
        label = term_labeling.token_label(tree.node_token(idx))
        lhs.add_arg([label])
        dcp_rule = DCP_rule(DCP_var(-1, 0), [
            DCP_term(DCP_index(0, edge_label=tree.node_token(idx).edge()), [])
        ])
        gram.add_rule(lhs, [], dcp=[dcp_rule])
        return lhs.nont(), _bot, _top

    if not len(tree.children(idx)) <= 2:
        raise ValueError("Tree is not prebinarized!", tree, idx)

    # Pair every child with the spans it covers.
    children = [(child, join_spans(tree.fringe(child)))
                for child in tree.children(idx)]
    # Edge labels of the children that are inlined as terminals, collected in
    # the order in which they are encountered while walking the spans.
    edge_labels = []
    # Build one LHS argument per span of idx by walking the positions of the
    # span and emitting, for the child span that starts at the current
    # position, either a terminal or an LCFRS variable.
    for (low, high) in spans:
        arg = []
        pos = low
        # NOTE(review): pos only advances when some child span starts at pos;
        # if the child spans do not cover [low, high] exactly, this loop does
        # not terminate.
        while pos <= high:
            # Index among the children that become RHS nonterminals
            # (restarted on every pass over the children).
            child_num = 0
            for i, (child, child_spans) in enumerate(children):
                for j, (child_low, child_high) in enumerate(child_spans):
                    if pos == child_low:
                        if child in tree.full_yield() and not isolate_pos:
                            # Inline the child as a terminal.
                            arg += [
                                term_labeling.token_label(
                                    tree.node_token(child))
                            ]
                            edge_labels += [tree.node_token(child).edge()]
                        else:
                            # j-th component of the child_num-th RHS nont.
                            arg += [LCFRS_var(child_num, j)]
                        # Continue right after the child's span.
                        pos = child_high + 1
                # Inlined terminals do not occupy an RHS position.
                if child not in tree.full_yield() or isolate_pos:
                    child_num += 1
        lhs.add_arg(arg)

    # Children of the DCP term generated for idx, in the order of children.
    dcp_term_args = []
    rhs = []
    nont_counter = 0
    term_counter = 0

    # _bot / _top lists of the RHS nonterminals, indexed by RHS position.
    cbots = []
    ctops = []

    for (child, child_spans) in children:

        if child not in tree.full_yield() or isolate_pos:
            # Child becomes an RHS nonterminal; its rules are added recursively.
            c_nont, _cbot, _ctop = direct_extract_lcfrs_prebinarized_recur(
                tree, child, gram, term_labeling, nont_labeling, isolate_pos)
            rhs.append(c_nont)
            cbots.append(_cbot)
            ctops.append(_ctop)
            # Refer to the argument of the RHS nont that belongs to the child
            # node itself (position of child in _ctop, offset by len(_cbot)).
            dcp_term_args.append(
                DCP_var(nont_counter,
                        len(_cbot) + _ctop.index(child)))
            nont_counter += 1
        else:
            # Inlined terminal: childless term linked to the term_counter-th
            # terminal of the rule.
            # NOTE(review): edge_labels was filled in span-walking order while
            # term_counter advances in children order; these agree only if
            # both orders coincide - confirm.
            dcp_term_args.append(
                DCP_term(
                    DCP_index(term_counter,
                              edge_label=edge_labels[term_counter]), []))
            term_counter += 1

    # Secondary children: only targets outside the subtree of idx are
    # supported. They must be contained in _bot and are attached as a
    # "SECEDGE" term around the corresponding LHS argument.
    for sec, sec_child in enumerate(tree.sec_children(idx)):
        if sec_child not in tree.descendants(idx):
            print(idx, "has external", sec_child)
            assert sec_child in _bot
            dcp_term_args.append(
                DCP_term(DCP_string("SECEDGE"),
                         [DCP_var(-1, _bot.index(sec_child))]))

        else:
            print(idx, "has internal", sec_child)

            # Secondary edges into the own subtree are not handled.
            assert False

    # LHS argument that is defined by the term for idx.
    dcp_lhs = DCP_var(-1, len(_bot) + _top.index(idx))

    label = tree.node_token(idx).category()
    # Labels matching '.*\|<.*>' do not get a term of their own; the children
    # are passed up as a plain sequence instead.
    # (presumably these are artificial labels introduced by binarization -
    # TODO confirm)
    if re.match(r'.*\|<.*>', label):
        dcp_term = dcp_term_args
    else:
        dcp_term = [
            DCP_term(DCP_string(label, edge_label=tree.node_token(idx).edge()),
                     dcp_term_args)
        ]
    dcp_rule = DCP_rule(dcp_lhs, dcp_term)

    dcp_rules = [dcp_rule]

    # Every other entry of _top has to be provided by some RHS nonterminal:
    # copy the corresponding argument of the first child that lists it in
    # its own _top.
    for top_idx in _top:
        if top_idx != idx:
            # must be in some child
            rule = None

            for nont_counter, _ctop in enumerate(ctops):
                if top_idx in _ctop:
                    rule = DCP_rule(
                        DCP_var(-1,
                                len(_bot) + _top.index(top_idx)), [
                                    DCP_var(
                                        nont_counter,
                                        len(cbots[nont_counter]) +
                                        _ctop.index(top_idx))
                                ])

                    break
            assert rule is not None
            dcp_rules.append(rule)

    # Every _bot entry of an RHS nonterminal needs a defining equation: it is
    # either passed down from the LHS (entry also in _bot) or taken from the
    # _top of an RHS nonterminal.
    for nont_counter, _cbot in enumerate(cbots):
        for bot_idx in _cbot:
            rule = None
            rule_lhs = DCP_var(nont_counter, _cbot.index(bot_idx))

            if bot_idx in _bot:
                rule = DCP_rule(rule_lhs, [DCP_var(-1, _bot.index(bot_idx))])
            else:
                for nont_counter2, _ctop in enumerate(ctops):
                    if bot_idx in _ctop:
                        rule = DCP_rule(rule_lhs, [
                            DCP_var(
                                nont_counter2,
                                len(cbots[nont_counter2]) +
                                _ctop.index(bot_idx))
                        ])
                        break
            assert rule is not None
            dcp_rules.append(rule)

    gram.add_rule(lhs, rhs, dcp=dcp_rules)

    return nont, _bot, _top