Example #1
def reshape_for_coordination(node, inside_np_internal_structure):
    if node.count() >= 3:
        # (XP PU) (CC XP)
        # if we get contiguous PU CC, associate the PU with the previous conjunct
        # but:
        # XP (PU XP) (CC XP)
        # XP (PU XP PU) (CC XP)
        # the rule is:
        # attach PU to the right _unless_ it is followed by CC
        
        kid_tag = base_tag(node.tag, strip_cptb_tag=False)
        
        kids = node.kids
        
        seen_cc = False
        last_kid, seen_cc = get_kid(kids, seen_cc)
        second_last_kid, seen_cc = get_kid(kids, seen_cc)
        
        cur = Node(kid_tag, [second_last_kid, last_kid], head_index=1)
        
        while kids:
            kid, seen_cc = get_kid(kids, seen_cc)
            cur = Node(kid_tag, [kid, cur], head_index=1)
        
        cur.tag = node.tag
        return cur
    
    return label_adjunction(node, inside_np_internal_structure=inside_np_internal_structure, do_labelling=False)
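
The PU/CC rule spelled out in the comments can be checked in isolation. The sketch below is illustrative only and does not use the project's Node or get_kid helpers; the tag lists are made-up inputs.

# Illustration only: the attachment decision, independent of Node/get_kid.
def pu_attaches_left(tags, i):
    """tags is a flat list of kid tags; i indexes a 'PU' kid."""
    # PU attaches rightward by default, leftward when a CC immediately follows.
    return i + 1 < len(tags) and tags[i + 1] == 'CC'

assert pu_attaches_left(['XP', 'PU', 'CC', 'XP'], 1)             # (XP PU) (CC XP)
assert not pu_attaches_left(['XP', 'PU', 'XP', 'CC', 'XP'], 1)   # XP (PU XP) (CC XP)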
Example #2
    def clusterfix(self, top, pp, p, s, t):
        debug("Fixing argument cluster coordination: %s", pprint(top))
        debug('T: %s', t)
        # 1. Shrink the verb (node T)
        self.fix_object_gap(pp, p, t, s)
        # 2. Reattach the verb above the TOP node
        new_node = Node('TAG', top.kids, top.category, head_index=0)
        top.kids = [t, new_node]
        # (Reattaching parent pointers)
        for kid in new_node:
            kid.parent = new_node

        # 3. Find and relabel argument clusters
        for node, ctx in find_all(top,
                                  r'/VP/=VP <1 /NP/=NP <2 /(QP|V[PV])/=QP',
                                  with_context=True):
            vp, np, qp = ctx.vp, ctx.np, ctx.qp
            # Now, VP should have category ((S[dcl]\NP)/QP)/NP
            SbNP = t.category.left.left
            QP, NP = qp.category, np.category
            # NP should have category ((S[dcl]\NP)/QP)\(((S[dcl]\NP)/QP)/NP)
            new_np_category = (SbNP / QP) | ((SbNP / QP) / NP)
            # QP should have category ((S[dcl]\NP)\((S[dcl]\NP)/QP))
            new_qp_category = (SbNP) | ((SbNP) / QP)

            # insert unary nodes
            new_np_node = Node(np.tag, [np], new_np_category, head_index=0)
            np.parent = new_np_node
            new_qp_node = Node(qp.tag, [qp], new_qp_category, head_index=0)
            qp.parent = new_qp_node

            replace_kid(vp, np, new_np_node)
            replace_kid(vp, qp, new_qp_node)

            self.fix_categories_starting_from(new_np_node, top)
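
The category expressions above use overloaded slash operators on category objects; judging from the comments, / builds a forward slash and | a backward slash. A throwaway stand-in class (not the project's implementation) reproduces the strings in the comments:

# Toy stand-in for the project's category objects; for illustration only.
class Cat(object):
    def __init__(self, s):
        self.s = s
    def __div__(self, other):                 # '/' under Python 2: forward slash
        return Cat('(%s/%s)' % (self.s, other.s))
    __truediv__ = __div__                     # '/' under Python 3
    def __or__(self, other):                  # '|' builds the backward slash
        return Cat('(%s\\%s)' % (self.s, other.s))
    def __repr__(self):
        return self.s

SbNP, QP, NP = Cat('(S[dcl]\\NP)'), Cat('QP'), Cat('NP')
print((SbNP / QP) | ((SbNP / QP) / NP))   # (((S[dcl]\NP)/QP)\(((S[dcl]\NP)/QP)/NP))
print(SbNP | (SbNP / QP))                 # ((S[dcl]\NP)\((S[dcl]\NP)/QP))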
Example #3
def label_head_initial(node, inherit_tag=False):
    if has_tag(node, 'c'): inherit_tag=False
    kid_tag = strip_tag_if(not inherit_tag, node.tag)
    
    kids = map(label_node, node.kids)[::-1]
    first_kid, second_kid = twice(kids.pop)()
    
    cur = Node(kid_tag, [first_kid, second_kid], head_index=0)
    
    while kids:
        kid = kids.pop()
        cur = Node(kid_tag, [cur, kid], head_index=0)
    
    cur.tag = node.tag
    return cur
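
For reference, the shape this head-initial loop builds can be sketched with plain tuples standing in for Node (illustration only; assumes at least two kids):

# Illustration only: left-branching, head-initial binarisation over tuples.
def binarise_head_initial(tag, kids):
    kids = list(kids)
    cur = (tag, [kids[0], kids[1]])
    for kid in kids[2:]:
        cur = (tag, [cur, kid])        # head stays inside the left branch
    return cur

# binarise_head_initial('VP', ['a', 'b', 'c', 'd'])
# -> ('VP', [('VP', [('VP', ['a', 'b']), 'c']), 'd'])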
Example #4
def label_predication(node, inherit_tag=False):
    kid_tag = strip_tag_if(not inherit_tag, node.tag)
    
    kids = map(label_node, node.kids)
    last_kid, second_last_kid = twice(get_kid_)(kids)
    
    cur = Node(kid_tag, [second_last_kid, last_kid], head_index=1)
    
    while kids:
        kid = get_kid_(kids)
        cur = Node(kid_tag, [kid, cur], head_index=1)
    
    cur.tag = node.tag # restore the full tag at the topmost level
    
    return cur
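
The predication loop is the mirror image: right-branching and head-final. Again a tuple-level sketch, for illustration only:

# Illustration only: right-branching, head-final binarisation over tuples.
def binarise_head_final(tag, kids):
    kids = list(kids)
    cur = (tag, [kids[-2], kids[-1]])
    for kid in reversed(kids[:-2]):
        cur = (tag, [kid, cur])        # head stays inside the right branch
    return cur

# binarise_head_final('IP', ['a', 'b', 'c', 'd'])
# -> ('IP', ['a', ('IP', ['b', ('IP', ['c', 'd'])])])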
Example #5
    def fix_ip_app(self, p, a, s):
        debug("Fixing IP-APP NX: %s", lrp_repr(p))
        new_kid = copy(a)
        new_kid.tag = base_tag(
            new_kid.tag)  # relabel to stop infinite matching
        replace_kid(
            p, a, Node("NN", [new_kid], s.category / s.category, head_index=0))
Example #6
    def fix_nongap_extraction(self, _, n, pred, k):
        node = n
        debug("Fixing nongap extraction: %s", pprint(node))
        debug("k %s", pprint(k))
        self.remove_null_element(node)

        index = get_trace_index_from_tag(k.tag)
        expr = (
            r'*=PP < { *=P < { /[NPQ]P(?:-%(tags)s)?%(index)s/=T << ^/\*T\*/ $ *=S } }'
            % {
                'tags': ModifierTagsRegex,
                'index': index
            })

        # we use "<<" in the expression, because fix_*_topicalisation comes
        # before fix_nongap_extraction, and this can introduce an extra layer between
        # the phrasal tag and the trace
        for trace_NP, ctx in find_all(node, expr, with_context=True):
            pp, p, t, s = ctx.pp, ctx.p, ctx.t, ctx.s

            # remove T from P
            # replace P with S
            self.fix_object_gap(pp, p, t, s)

            if not self.relabel_relativiser(pred):
                top, context = get_first(node,
                                         r'/[ICV]P/=TOP $ *=SS',
                                         with_context=True)
                ss = context.ss

                debug("Creating null relativiser unary category: %s",
                      ss.category / ss.category)
                replace_kid(
                    top.parent, top,
                    Node("NN", [top], ss.category / ss.category, head_index=0))
Example #7
def label_apposition(node, inherit_tag=False, inside_np_internal_structure=False):
    kid_tag = strip_tag_if(not inherit_tag, node.tag)
    
    kids = map(lambda node: label_node(node, inside_np_internal_structure=inside_np_internal_structure), node.kids)
    last_kid = get_kid_(kids)
    if kids:
        second_last_kid = get_kid_(kids)
        cur = Node(kid_tag, [second_last_kid, last_kid], head_index=1)    
    else:
        cur = last_kid

    while kids:
        kid = get_kid_(kids)
        cur = Node(kid_tag, [kid, cur], head_index=1)
    
    cur.tag = node.tag
    return cur
Example #8
    def fix_topicalisation_without_gap(self, node, p, s, t):
        debug("Fixing topicalisation without gap: %s", pprint(node))

        new_kid = copy(t)
        new_kid.tag = base_tag(new_kid.tag, strip_cptb_tag=False)

        new_category = featureless(p.category) / featureless(s.category)
        replace_kid(p, t, Node(t.tag, [new_kid], new_category, head_index=0))
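
featureless() is a project helper; the sketch below only assumes its effect (dropping the bracketed feature from a category), to show why the inserted topic node ends up with a plain modifier category such as S/S rather than S[dcl]/S[dcl]:

# Illustration only: an assumed, string-level account of feature stripping.
import re

def strip_feature(cat):
    return re.sub(r'\[[^\]]*\]', '', cat)

p_cat = s_cat = 'S[dcl]'                                        # assumed categories for P and S
print('%s/%s' % (strip_feature(p_cat), strip_feature(s_cat)))   # S/S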
Example #9
    def fix_subject_extraction(self, _, n, pred, w=None, reduced=False):
        global use_bare_N

        debug("%s", reduced)
        node = n
        debug("Fixing subject extraction: %s", lrp_repr(node))

        # We only want this if we are using the N -> NP unary rule
        # This 'fix' lets us rewrite NP(WHNP CP) as NP(CP) with categories NP(N)
        if use_bare_N and pred.tag.startswith('NP'):
            # Fix for the NP(VP de) case:
            # ---------------------------
            #         NP                 NP
            #        /  \                |
            #      WHNP  CP     -->      CP
            #            / \            /  \
            #          IP  DEC         IP   DEC
            if not pred.is_leaf():
                pred.kids.pop(0)
                pred.head_index = 0
        else:
            if not reduced:
                self.remove_null_element(node)

        if w:
            index = get_trace_index_from_tag(w.tag)
        else:
            index = ''

        expr = r'*=PP < { *=P < { /NP-SBJ/=T << ^/\*T\*%s/ $ *=S } }' % index

        for trace_NP, ctx in find_all(node, expr, with_context=True):
            pp, p, t, s = ctx.pp, ctx.p, ctx.t, ctx.s

            self.fix_object_gap(pp, p, t, s)
            self.fix_categories_starting_from(s, until=node)

            if not self.relabel_relativiser(pred):
                # TOP is the shrunk VP
                # after shrinking, we can get VV or VA here
                # left_to_right so that we find the right node (used to match against the CP 已建成的 in 4:45(7))
                result = get_first(
                    node,
                    r'{ /([ICV]P|V[VA]|VRD|VSB|VCD)/=TOP $ *=SS } ! > /([ICV]P|V[VA]|VRD|VSB|VCD)/',
                    with_context=True,
                    left_to_right=True)
                if not result:
                    debug(
                        'Could not find verbal category; did not create null relativiser.'
                    )
                    return

                top, context = result
                SS = context.ss.category

                debug("Creating null relativiser unary category: %s", SS / SS)
                replace_kid(top.parent, top,
                            Node("NN", [top], SS / SS, head_index=0))
Example #10
    def fix_object_extraction(self, _, n, pred, w=None, reduced=False):
        global use_bare_N

        node = n
        debug("Fixing object extraction: %s", lrp_repr(node))

        # We only want this if we are using the N -> NP unary rule
        # This 'fix' lets us rewrite NP(WHNP CP) as NP(CP) with categories NP(N)
        if use_bare_N and pred.tag.startswith('NP'):
            # Fix for the NP(VP de) case:
            # ---------------------------
            #         NP                 NP
            #        /  \                |
            #      WHNP  CP     -->      CP
            #            / \            /  \
            #          IP  DEC         IP   DEC
            if not pred.is_leaf():
                pred.kids.pop(0)
                pred.head_index = 0
        else:
            if not reduced:
                self.remove_null_element(node)

        if w:
            index = get_trace_index_from_tag(w.tag)
        else:
            index = ''

        expr = r'/[IC]P/=TOP << { *=PP < { *=P < { /NP-(OBJ|EXT)/=T << ^/\*T\*%s/ $ *=S } } }' % index

        for trace_NP, ctx in find_all(node, expr, with_context=True):
            top, pp, p, t, s = ctx.top, ctx.pp, ctx.p, ctx.t, ctx.s

            self.fix_object_gap(pp, p, t, s)
            self.fix_categories_starting_from(s, until=top)

            # If we couldn't find the DEC node, this is the null relativiser case
            if not self.relabel_relativiser(pred):
                # TOP is the S node
                # null relativiser category comes from sibling of TOP
                # if TOP has no sibling, then we're likely inside an NP-PRD < CP reduced relative (cf 1:2(9))
                result = get_first(top,
                                   r'* $ *=SS',
                                   with_context=True,
                                   nonrecursive=True)
                if result:
                    _, ctx = result
                    ss = ctx.ss
                    debug("Creating null relativiser unary category: %s",
                          ss.category / ss.category)
                    replace_kid(
                        top.parent, top,
                        Node("NN", [top],
                             ss.category / ss.category,
                             head_index=0))
Example #11
    def fix_modification(self, node, p, s, t):
        debug("Fixing modification: %s", lrp_repr(node))
        S, P = s.category, p.category

        # If you don't strip the tag :m from the newly created child (new_kid),
        # the fix_modification pattern will match infinitely when tgrep visits new_kid
        new_kid = copy(t)
        new_kid.tag = base_tag(new_kid.tag, strip_cptb_tag=False)

        new_category = featureless(P) / featureless(S)
        debug("Creating category %s", new_category)
        replace_kid(p, t, Node(t.tag, [new_kid], new_category, head_index=0))
Example #12
def label_adjunction(node, inherit_tag=False, do_labelling=True, inside_np_internal_structure=False):
    kid_tag = strip_tag_if(not inherit_tag, node.tag)
    
    if do_labelling:
        kids = map(lambda node: label_node(node, inside_np_internal_structure=inside_np_internal_structure), node.kids)
    else:
        kids = node.kids
    
#    last_kid, second_last_kid = twice(kids.pop)()
    last_kid = get_kid_(kids)
    if kids:
        second_last_kid = get_kid_(kids)
        cur = Node(kid_tag, [second_last_kid, last_kid], head_index=1)    
    else:
        cur = last_kid

    while kids:
        kid = get_kid_(kids)
        cur = Node(kid_tag, [kid, cur], head_index=1)
    
    cur.tag = node.tag
    return cur
Example #13
    def accept_derivation(self, bundle):
        for node, ctx in find_all(bundle.derivation, expr, with_context=True):
            u = ctx.n.lex.decode('u8')
            if u[0] in baixing:
                leaf = ctx.n
                kids = [
                    Leaf(leaf.tag, u[0].encode('u8'), None),
                    Leaf(leaf.tag, u[1:].encode('u8'), None)
                ]
                replace_kid(ctx.n.parent, ctx.n, Node('NR', kids))
                #node.kids = kids

        self.write_derivation(bundle)
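
What the loop does to a matching leaf can be reproduced on a standalone byte string; the name below is invented, and whether its first character is in baixing depends on the surname list:

# Illustration only: splitting the first character (the surname) off a lexeme.
lex = u'王小明'.encode('utf-8')                  # a Leaf's lex as a UTF-8 byte string
u = lex.decode('utf-8')
surname, given = u[0].encode('utf-8'), u[1:].encode('utf-8')
assert (surname, given) == (u'王'.encode('utf-8'), u'小明'.encode('utf-8'))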
Example #14
    def fix_rnr(self, rnr, g):
        # G is the node dominating all the conjuncts
        rnr_tags = []
        for node, ctx in find_all(g, r'/:c/a', with_context=True):
            for rnr in find_all(node, r'^/\*RNR\*/'):
                rnr_tags.append(get_trace_index_from_tag(rnr.lex))

        for index in rnr_tags:
            for node, ctx in find_all(
                    g,
                    r'*=PP < { *=P < { *=T < ^/\*RNR\*%s/ $ *=S } }' % index,
                    with_context=True):
                inherit_tag(ctx.s, ctx.p)
                self.fix_object_gap(ctx.pp, ctx.p, ctx.t, ctx.s)
                self.fix_categories_starting_from(ctx.s, g)

        # This breaks with the IP (LC CC LC) case in 9:19(11) -- last_conjunct returns None
        # because the last conjunct has been shrunk
        last_conjunct = list(find_first(g, r'/:c/a', left_to_right=False))

        args = []
        # Here, we uniquify the rnr tags so that we excise each shared argument only once
        for index in set(rnr_tags):
            # find_first, because we only want to find one match, the shallowest.
            # cf 7:27(10), if NP-OBJ-2(NN NP-OBJ-2(JJ NN)), then we only want to identify
            # one matching node for index -2 -- the shallowest -- and not two.
            for node, ctx in find_first(last_conjunct[0],
                                        r'*=P < { /%s/a=T $ *=S }' % index,
                                        with_context=True):
                args.append(ctx.t)

                # Note: last_conjunct may be disconnected from
                # the tree by replace_kid (when ctx.p == last_conjunct)
                replace_kid(ctx.p.parent, ctx.p, ctx.s)
                self.fix_categories_starting_from(ctx.s, g)

        # Because the find_all which retrieved the args is an in-order left-to-right traversal, it will find
        # shallower nodes before deeper nodes. Therefore, if a verb has two args V A1 A2, the _args_ list will
        # contain [A2, A1] because A2 is shallower (further from the head) than A1.
        # We reverse the list of args, so that args are re-attached from the inside out (starting from A1).
        # args.reverse()

        new_g = g
        for arg in args:
            new_g = Node(new_g.tag, [new_g, arg],
                         new_g.category.left,
                         head_index=0)
            arg.parent = new_g

        replace_kid(g.parent, g, new_g)
Example #15
    def fix_whword_topicalisation(self, node, p, s, t):
        debug('Fixing wh-word topicalisation: node: %s', lrp_repr(node))
        # stop this method from matching again (in case there's absorption on the top node, cf 2:22(5))
        t.tag = base_tag(t.tag, strip_cptb_tag=False)
        # create topicalised category based on the tag of T
        typeraise_t_category = ptb_to_cat(t)
        # insert a node with the topicalised category
        replace_kid(
            p, t,
            Node(base_tag(t.tag, strip_cptb_tag=False), [t],
                 typeraise(typeraise_t_category, SbNP, TR_TOPICALISATION),
                 head_index=0))

        index = get_trace_index_from_tag(t.tag)

        expr = r'*=PP < { /VP/=P < { /NP-(?:SBJ|OBJ)/=T < ^/\*T\*%s/ $ *=S } }' % index

        for top, ctx in find_all(p, expr, with_context=True):
            replace_kid(ctx.pp, ctx.p, ctx.s)
            self.fix_categories_starting_from(ctx.s, until=top)
Example #16
    def fix_topicalisation_with_gap(self, node, p, s, t):
        debug("Fixing topicalisation with gap:\nnode=%s\ns=%s\nt=%s",
              lrp_repr(node), pprint(s), pprint(t))

        # stop this method from matching again (in case there's absorption on the top node, cf 2:22(5))
        t.tag = base_tag(t.tag, strip_cptb_tag=False)
        # create topicalised category based on the tag of T
        typeraise_t_category = ptb_to_cat(t)
        # insert a node with the topicalised category
        replace_kid(
            p, t,
            Node(base_tag(t.tag, strip_cptb_tag=False), [t],
                 typeraise(typeraise_t_category, S, TR_TOPICALISATION),
                 head_index=0))

        index = get_trace_index_from_tag(t.tag)

        # attested gaps:
        # 575 IP-TPC:t
        # 134 NP-TPC:t
        #  10 IP-Q-TPC:t
        #   8 CP-TPC:t
        #   4 NP-PN-TPC:t
        #   2 QP-TPC:t
        #   2 NP-TTL-TPC:t
        #   1 PP-TPC:t
        #   1 IP-IJ-TPC:t
        #   1 INTJ-TPC:t
        #   1 CP-Q-TPC:t
        #   1 CP-CND-TPC:t
        expr = r'/IP/=TOP << { *=PP < { *=P < { /[NICQP]P-(?:SBJ|OBJ)/=T < ^/\*T\*%s/ $ *=S } } }' % index

        for top, ctx in find_all(s, expr, with_context=True):
            debug('top: %s', pprint(top))
            self.fix_object_gap(ctx.pp, ctx.p, ctx.t, ctx.s)
            self.fix_categories_starting_from(ctx.s, until=top)
Example #17
    def fix_categories_starting_from(self, node, until):
        '''Adjusts category labels from _node_ to _until_ (not inclusive) to obtain the correct
CCG analysis.'''
        while node is not until:
            # Only fix binary rules
            if (not node.parent) or node.parent.count() < 2: break

            l, r, p = node.parent[0], node.parent[1], node.parent
            L, R, P = (n.category for n in (l, r, p))
            debug("L: %s R: %s P: %s", L, R, P)

            applied_rule = analyse(L, R, P)
            debug("[ %s'%s' %s'%s' -> %s'%s' ] %s", L, ''.join(l.text()), R,
                  ''.join(r.text()), P, ''.join(p.text()), applied_rule)

            if applied_rule is None:
                debug("invalid rule %s %s -> %s", L, R, P)

                if R.is_complex() and R.left.is_complex(
                ) and L == R.left.right:
                    # L       (X|L)|Y -> X|Y becomes
                    # X|(X|L) (X|L)|Y -> X|Y
                    T = R.left.left
                    new_category = typeraise(L, T, TR_FORWARD)  #T/(T|L)
                    node.parent[0] = Node(l.tag, [l],
                                          new_category,
                                          head_index=0)

                    new_parent_category = fcomp(new_category, R)
                    if new_parent_category:
                        debug("new parent category: %s", new_parent_category)
                        p.category = new_parent_category

                    debug("New category: %s", new_category)

                elif L.is_complex() and L.left.is_complex(
                ) and R == L.left.right:
                    # (X|R)|Y R       -> X|Y  becomes
                    # (X|R)|Y X|(X|R) -> X|Y
                    T = L.left.left
                    new_category = typeraise(R, T, TR_BACKWARD)  #T|(T/R)
                    node.parent[1] = Node(r.tag, [r],
                                          new_category,
                                          head_index=0)

                    new_parent_category = bxcomp(L, new_category)
                    if new_parent_category:
                        debug("new parent category: %s", new_parent_category)
                        p.category = new_parent_category

                    debug("New category: %s", new_category)

                # conj R -> P
                # Make P into R[conj]
                # L cannot be the comma category (,), otherwise we get a mis-analysis
                # in 2:22(5)
                if str(L) in ('conj', 'LCM'):
                    p.category = R.clone_adding_feature('conj')
                    debug("New category: %s", p.category)

                # L R[conj] -> P
                elif R.has_feature('conj'):
                    new_L = L.clone()

                    r.category = new_L.clone_adding_feature('conj')
                    p.category = new_L

                    debug("New category: %s", new_L)

                elif L.is_leaf():
                    # , R -> P[conj] becomes , R -> R[conj]
                    if P.has_feature('conj') and l.tag in (
                            'PU', 'CC'):  # treat as partial coordination
                        debug("Fixing coordination: %s" % P)
                        p.category = r.category.clone_adding_feature('conj')
                        debug("new parent category: %s" % p.category)

                    # , R -> P becomes , R -> R
                    elif l.tag == "PU" and not P.has_feature(
                            'conj'):  # treat as absorption
                        debug("Fixing left absorption: %s" % P)
                        p.category = r.category

                    # L       (X|L)|Y -> X|Y becomes
                    # X|(X|L) (X|L)|Y -> X|Y
                    elif R.is_complex() and R.left.is_complex(
                    ) and L == R.left.right:
                        T = R.left.left
                        new_category = typeraise(L, T, TR_FORWARD)  #T/(T|L)
                        node.parent[0] = Node(l.tag, [l],
                                              new_category,
                                              head_index=0)

                        new_parent_category = fcomp(new_category, R)
                        if new_parent_category:
                            debug("new parent category: %s",
                                  new_parent_category)
                            p.category = new_parent_category

                        debug("New category: %s", new_category)

                elif R.is_leaf():
                    # R , -> P becomes R , -> R
                    if r.tag == "PU":  # treat as absorption
                        debug("Fixing right absorption: %s" % P)
                        p.category = l.category

                    # (X|R)|Y R       -> X|Y  becomes
                    # (X|R)|Y X|(X|R) -> X|Y
                    elif L.is_complex() and L.left.is_complex(
                    ) and R == L.left.right:
                        T = L.left.left
                        new_category = typeraise(R, T, TR_BACKWARD)  #T|(T/R)
                        node.parent[1] = Node(r.tag, [r],
                                              new_category,
                                              head_index=0)

                        new_parent_category = bxcomp(L, new_category)
                        if new_parent_category:
                            debug("new parent category: %s",
                                  new_parent_category)
                            p.category = new_parent_category

                        debug("New category: %s", new_category)

                else:
                    new_parent_category = None

                    # try typeraising fix
                    # T/(T/X) (T\A)/X -> T can be fixed:
                    # (T\A)/((T\A)/X) (T\A)/X -> T\A
                    if self.is_topicalisation(L) and (L.right.right == R.right
                                                      and P == L.left
                                                      and P == R.left.left):
                        T_A = R.left
                        X = R.right

                        l.category = T_A / (T_A / X)
                        new_parent_category = T_A

                    # (X|X)|Z Y       -> X becomes
                    # (X|X)|Z X|(X|X) -> X|Z
                    elif L.is_complex() and L.left.is_complex(
                    ) and R == L.left.right:
                        T = L.left.left
                        new_category = typeraise(
                            R, R, TR_BACKWARD, strip_features=False)  #T/(T|L)
                        node.parent[1] = Node(r.tag, [r],
                                              new_category,
                                              head_index=0)

                        new_parent_category = bxcomp(L, new_category)
                        if new_parent_category:
                            debug("new parent category: %s",
                                  new_parent_category)
                            p.category = new_parent_category

                        debug("New category: %s", new_category)

                    # Generalise over right modifiers of verbal categories (S[dcl]\X)$
                    elif self.is_verbal_category(
                            L) and L.is_complex() and L.left.is_complex():
                        T = L.left.right
                        new_category = typeraise(R, T, TR_BACKWARD)
                        debug('Trying out %s', new_category)

                        if bxcomp(L, new_category):
                            node.parent[1] = Node(r.tag, [r],
                                                  new_category,
                                                  head_index=0)
                            new_parent_category = bxcomp(L, new_category)

                    # Last ditch: try all of the composition rules to generalise over L R -> P
                    if not new_parent_category:
                        # allowing fxcomp creates bad categories in the NP(IP DEC) construction (1:97(3)),
                        # but we need fxcomp to create the gap in NP-TPC NP-SBJ(*T*) VP, so allow it
                        # when the RHS doesn't look like the DEC category
                        new_parent_category = (
                            fcomp(L, R)
                            or bcomp(L, R, when=not self.is_relativiser(R))
                            or bxcomp(
                                L, R, when=not self.is_relativiser(R)
                            )  #or bxcomp2(L, R, when=self.is_verbal_category(L)) 
                            or fxcomp(L, R, when=not self.is_relativiser(R)))

                    if new_parent_category:
                        debug("new parent category: %s", new_parent_category)
                        p.category = new_parent_category
                    else:
                        debug("couldn't fix, skipping")

            node = node.parent
            debug('')
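
The forward type-raising repair in the first branch can be followed on a concrete pair of categories chosen for illustration: L = NP cannot combine with R = (S\NP)/NP, so L is raised to T/(T\L) with T = S, and forward composition then yields the parent S/NP:

# Illustration only: a string-level walk-through of the forward type-raising fix.
def typeraise_forward(T, L):
    return '%s/(%s\\%s)' % (T, T, L)            # T/(T\L)

L, X, Y = 'NP', 'S', 'NP'
R = '(%s\\%s)/%s' % (X, L, Y)                    # (S\NP)/NP plays the role of (X\L)/Y
raised = typeraise_forward(X, L)                 # S/(S\NP)
parent = '%s/%s' % (X, Y)                        # forward composition then gives X/Y
print('%s  %s  ->  %s' % (raised, R, parent))    # S/(S\NP)  (S\NP)/NP  ->  S/NP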
Example #18
def preprocess(root):
    # IP < PP PU -> PP < PP PU (20:58(1))
    if root.count() == 2 and root[1].tag == 'PU' and root[0].tag.startswith('PP'): root.tag = root[0].tag

    for node in nodes(root):
        if node.is_leaf(): continue

        if rewrite_lcp_as_np and node.tag.startswith('LCP'):
            node.tag = node.tag.replace('LCP', 'NP')

        first_kid, first_kid_index = get_nonpunct_kid(node, get_last=False)
        last_kid,  last_kid_index  = get_nonpunct_kid(node, get_last=True)
        # ---------------------
        # Where LPU, RPU are paired punctuation, reshape YP(LPU ... XP RPU YP) into YP(XP(LPU ... XP RPU) YP)
        if any(kid.lex in ("“", "「") for kid in leaf_kids(node)) and any(kid.lex in ("”", "」") for kid in leaf_kids(node)):
            lqu = first_index_such_that(lambda kid: kid.is_leaf() and kid.lex in ("“", "「"), node)
            rqu = first_index_such_that(lambda kid: kid.is_leaf() and kid.lex in ("”", "」"), node)
            if rqu != node.count()-1:
                quoted_kids = node.kids[lqu:rqu+1]
                del node.kids[lqu:rqu+1]

                last_nonpunct_kid, _ = get_nonpunct_element(quoted_kids, get_last=True)
                # Bad punctuation in 27:84(4) causes a mis-analysis, just ignore
                if last_nonpunct_kid:
                    quoted_node = Node(last_nonpunct_kid.tag, quoted_kids)
                    node.kids.insert(lqu, quoted_node)

        # CPTB/Chinese-specific fixes
        # ---------------------------
        # PP(P CP NP) in derivations like 5:11(3) should be PP(P NP(CP NP))
        if first_kid and first_kid.tag == "P" and node.count() > 2:
            last_tag = last_kid.tag
            rest = node.kids[1:]
            del node.kids[1:]
            node.kids.append(Node(last_tag, rest, node))
        # 2:12(3). DNP-PRD fixed by adding a layer of NP
        elif (node.tag.startswith('VP') and node.count() == 2 and
                node[0].tag.startswith('VC') and
                node[1].tag.startswith('DNP-PRD')): node[1] = Node('NP', [node[1]], node)
        # fix missing -OBJ tag from VP object complements (cf. 31:18(4))
        elif (node.tag.startswith('VP') and node.count() >= 2 and
              node[0].tag == 'VV' and
              node[-1].tag == 'NP'): node[-1].tag += "-OBJ"
        # fix bad annotation IP < IP (2:7(28)), VP < VP (0:1(5))
        elif any(is_repeated_unary_projection(xp, node) for xp in ('IP', 'VP', 'NP', 'CP')):
            node.kids = node[0].kids
        # treat DP-SBJ as QP-SBJ (6:37(9)): the rationale is that the determiner (e.g. 每) acts as a specifier,
        # just like a quantity
        elif node.tag == 'DP-SBJ':
            node.tag = 'QP-SBJ'
        # attach the PU preceding a PRN under the PRN
        elif last_kid and last_kid.tag == 'PRN' and last_kid.count() == 1:
            maybe_pu = node[last_kid_index-1]
            if maybe_pu.tag == 'PU':
                del node.kids[last_kid_index-1]
                last_kid.kids.insert(0, maybe_pu) # prepend
        # DEG instead of DEC (29:34(3)). if there's a trace in DEG's sibling and no DEC, then change DEG to DEC.
        elif node.tag == 'CP' and node.count() == 2 and node[0].tag == 'IP' and node[1].tag == 'DEG':
            if get_first(node[0], r'^/\*T\*/') and not get_first(node[0], r'/DEC/'):
                node[1].tag = 'DEC'

        elif node.tag.startswith('NP') and any(kid.tag.startswith('QP-APP') for kid in node):
            for kid in node:
                if kid.tag.startswith('QP-APP'): kid.tag = kid.tag.replace('QP', 'NP')

        # NP(CP NP-APP NP-PN) -> NP(CP NP(NP-APP NP-PN)) so that NP(NP-APP NP-PN) can receive the NP-internal-structure style of analysis
        elif node.tag.startswith('NP') and node.count() == 3 and node[0].tag.startswith('CP') and node[1].tag.startswith('NP-APP') and node[2].tag.startswith('NP-PN'):
            np_app, np_pn = node[1], node[2]
            del node.kids[1:]

            node.kids.append(Node(node.tag, [np_app, np_pn], node))

        # IP < NP-SBJ ADVP VP rather than IP < NP-SBJ VP(ADVP VP) (25:59(12), 6:92(19))
        elif node.tag == 'IP' and node.count() == 3 and node[0].tag == 'NP-SBJ' and node[1].tag == 'ADVP' and node[2].tag == 'VP':
            advp = node.kids.pop(1)
            # VP is the new node[1]
            # now replace node[1] with Node(node[1])
            node[1] = Node(node[1].tag, [advp, node[1]], node)

        # fixing DNP(PN DEG), which causes mis-tagging DNP(PN:l DEG:h)
        # only 3 cases: 23:61(5), 9:14(14), 21:3(11)
        elif node.tag == 'DNP' and node.count() == 2 and node[0].tag == 'PN' and node[1].tag == 'DEG':
            replace_kid(node, node[0], Node('NP', [node[0]]))

        elif is_vnv(node) and node.count() == 3:
            # Re-analyse VNV as coordination
            node[1].tag = 'CC'

        # fix mistaggings of the form ADVP < JJ (1:7(9)), NP < JJ (5:35(1))
        elif node.count() == 1:
            # fix IP < VP by adding *pro*
            if node.tag.startswith('IP') and node[0].tag.startswith('VP'):
                leaf = Leaf('-NONE-', '*pro*', None)
                pro = Node('NP-SBJ', [leaf])

                node.kids.insert(0, pro)
            elif node[0].tag == 'JJ':
                if node.tag.startswith('ADVP'):
                    node.tag = node.tag.replace('ADVP', 'ADJP')
                elif node.tag.startswith('NP'):
                    node.tag = node.tag.replace('NP', 'ADJP')

            # fix NP < VV
            elif node.tag == 'NP' and node[0].tag == 'VV':
                node.tag = node.tag.replace('NP', 'VP')

            # fix NP < ADJP < JJ (5:35(1))
            elif node.tag == 'NP' and node[0].tag == 'ADJP':
                replace_kid(node.parent, node, node[0])

            # fix projections NP < QP
            elif node[0].tag.startswith('QP') and node.tag.startswith('NP'):
                inherit_tag(node[0], node) # copy PCTB tags from NP to QP
                node.tag = node[0].tag # copy QP to parent, replacing NP
                node.kids = node[0].kids
            elif node[0].tag == 'IP' and node.tag == 'CP-APP':
                inherit_tag(node[0], node)
                node.tag = node[0].tag
                node.kids = node[0].kids
            # CLP < NN
            elif node[0].tag == 'NN' and node.tag == 'CLP':
                node[0].tag = 'M'
            elif node[0].tag == 'NN' and node.tag.startswith("VP"):
                node[0].tag = 'VV'
            elif node[0].tag == 'CP':
                if node.tag == 'NP-PRD':
                    node.kids = node[0].kids
                else:
                    # Rewrite NP < { CP < { CP < DEC } } 
                    # (i.e. 比 报告 的 早 一点) so that it's headed by the 的
                    expr = r'''/CP/ < { /CP/ < /DEC/ }'''
                    if get_first(node[0], expr):
                        node.kids = node[0].kids
                        
            elif node[0].tag in ('NP', 'NP-PN', 'VP', 'IP') and node.tag == 'PRN':
                node.kids = node[0].kids
                
            # ADVP < CS: shrink so that CS will be considered the head by binarise
            # CP < M: tagging error 7:14(8), 10:51(4), 11:13(32), 11:15(47)
            elif ((node.tag == 'ADVP' and node[0].tag == 'CS') or  
                  (node[0].tag == 'M' and node.tag == 'CP')):
                replace_kid(node.parent, node, node[0])
                
            # fix NP<DNP so that it's headed by the DEC 8:38(18), 0:30(4)
            elif node.tag.startswith('NP') and node[0].tag.startswith('DNP'):
                node.kids = node[0].kids

            # elif node.tag == 'VP' and node[0].tag == 'NP-PRD':
            #     replace_kid(node.parent, node, node[0])
            
            # couple of noisy derivs like 10:35(80), 10:26(121), 11:37(3)
            # elif node.tag == 'VP' and node[0].tag.startswith('IP'):
            #     replace_kid(node.parent, node, node[0])
                
        # Reshape LB (long bei)
        # ---------------------
        elif first_kid and first_kid.tag == "LB":
            expr = r'''* < { /LB/=LB
                       [ $ { * < /-(SBJ|OBJ|PN)/a=SBJ < /(V[PV]|VRD|VSB)/=PRED }=IP
                       | $ { /CP/=CP < { *=IP < /-(SBJ|OBJ|PN)/a=SBJ < /(V[PV]|VRD|VSB)/=PRED } } ] }'''
            top, ctx = get_first(node, expr, with_context=True)

            lb, sbj, pred, cp, ip = ctx.lb, ctx.sbj, ctx.pred, ctx.cp, ctx.ip
            top.kids = [lb, Node('IP', [sbj, pred])]
            # top.kids = [lb, sbj, pred]
            
        # elif False:
        elif first_kid and first_kid.tag == "BA":
            expr = r'''* < { /BA/=LB $ { /IP/ < /NP/=SBJ < /VP/=PRED } }'''
                
            result = get_first(node, expr, with_context=True)
            if result:
                top, ctx = result

                lb, sbj, pred = ctx.lb, ctx.sbj, ctx.pred  # the expression above binds no =IP
    #            top.kids = [lb, Node('IP', [sbj, pred])]
                top.kids = [lb, sbj, pred]

        # single mistagging CP-SBJ for CP in 24:58(1)
        elif node.tag == 'CP-SBJ': node.tag = 'CP'
        
        else:
            # Fix missing phrasal layer in NP < NN DEG (21:10(4))
            result = get_first(node, r'/DNP/=P < { /N[NRT]/=N $ /DEG/ }', with_context=True)
            if result:
                p, ctx = result
                n = ctx.n
                replace_kid(p, n, Node('NP', [n]))

            # Fix missing phrasal layer in LCP < NN LC (11:17(9))
            result = get_first(node, r'/LCP/=P < { /N[NRT]/=N $ /LC/ }', with_context=True)
            if result:
                p, ctx = result
                n = ctx.n
                replace_kid(p, n, Node('NP', [n]))

            # Fix wrongly attached DEC (5:26(6))
            result = get_first(node, r'/CP/=TOP < { /IP/=P < { /NP/ $ /VP/ $ /DEC/=DEC } }', with_context=True)
            if result:
                _, ctx = result
                top, p, dec = ctx.top, ctx.p, ctx.dec

                top.kids.append(dec)
                p.kids.remove(dec)

            result = get_first(node, r'*=PP < { /IP-TPC/=P <1 { /NP/=T < ^/\*PRO\*/ } <2 /VP/=S }', nonrecursive=True, with_context=True)
            if result:
                _, ctx = result
                pp, p, s = ctx.pp, ctx.p, ctx.s
                inherit_tag(s, p)
                replace_kid(pp, p, s)

            expr = r'''/VP/=VP <1 /VV/=V <2 { /IP-OBJ/ <1 /NP-SBJ/=SBJ <2 /VP/=PRED }'''
            result = get_first(node, expr, with_context=True)
            if result:
                _, ctx = result
                vp, v, sbj, pred = ctx.vp, ctx.v, ctx.sbj, ctx.pred

                del vp.kids
                if get_first(sbj, r'* < ^/\*PRO\*/'):
                    vp.kids = [v, pred]
                else:
                    vp.kids = [v, sbj, pred]

            expr = r'''/QP/=P <1 /CD/ <2 /CC/ <3 /CD/'''
            result = get_first(node, expr, with_context=True)
            if result:
                _, ctx = result
                p = ctx.p

                if p.count() <= 3: continue

                cd_cc_cd, rest = p.kids[0:3], p.kids[3:]
                del p.kids[0:3]

                new_node = Node('QP', cd_cc_cd)
                p.kids.insert(0, new_node)

    return root
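
The paired-quotation regrouping near the top of the loop can be followed on a flat list of kids (an invented sequence of tags and quote marks): everything from the opening to the closing quote is wrapped in a new node labelled after the last non-punctuation element in the span:

# Illustration only: a list-level version of the paired-quotation regrouping.
kids = [u'“', 'NP', 'PU', 'NP', u'”', 'VP']       # invented kid sequence
lqu = kids.index(u'“')
rqu = kids.index(u'”')
if rqu != len(kids) - 1:                          # only reshape when material follows the close quote
    quoted = kids[lqu:rqu + 1]
    kids[lqu:rqu + 1] = [('NP', quoted)]          # 'NP': tag of the last non-punct kid in the span
print(kids)                                       # the quoted span is now one ('NP', [...]) node, then 'VP'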