Example #1
0
    def fix_nongap_extraction(self, _, n, pred, k):
        """Repair the trace phrases left by a non-gap extraction under ``n``.

        The null element under ``n`` is removed first.  Every NP/PP/QP phrase
        that carries the trace index read from ``k.tag`` and dominates a
        ``*T*`` trace is then passed, with its parent, grandparent and
        sibling, to ``fix_object_gap``.  When ``relabel_relativiser(pred)``
        reports that no relativiser was found, the first ``[ICV]P`` node that
        has a sibling is wrapped in a unary "NN" node whose category is
        ``SS/SS`` (``SS`` being that sibling's category).

        Args:
            _: unused.
            n: root of the subtree being repaired.
            pred: node searched for an overt relativiser.
            k: node whose tag supplies the trace index.
        """
        node = n
        debug("Fixing nongap extraction: %s", pprint(node))
        debug("k %s", pprint(k))
        self.remove_null_element(node)

        index = get_trace_index_from_tag(k.tag)
        expr = (r'*=PP < { *=P < { /[NPQ]P(?:-%(tags)s)?%(index)s/=T << ^/\*T\*/ $ *=S } }'
                % {'tags': ModifierTagsRegex, 'index': index})

        # we use "<<" in the expression, because fix_*_topicalisation comes
        # before fix_nongap_extraction, and this can introduce an extra layer between
        # the phrasal tag and the trace
        for _trace_np, ctx in find_all(node, expr, with_context=True):
            # remove T from P
            # replace P with S
            self.fix_object_gap(ctx.pp, ctx.p, ctx.t, ctx.s)

            if not self.relabel_relativiser(pred):
                result = get_first(node, r'/[ICV]P/=TOP $ *=SS', with_context=True)
                # get_first may find nothing; unpacking that result directly
                # would raise, so bail out the same way
                # fix_subject_extraction does.
                if not result:
                    debug('Could not find verbal category; did not create null relativiser.')
                    return

                top, context = result
                ss_category = context.ss.category
                null_rel_category = ss_category / ss_category

                debug("Creating null relativiser unary category: %s", null_rel_category)
                replace_kid(top.parent, top,
                            Node("NN", [top], null_rel_category, head_index=0))
Example #2
0
    def fix_nongap_extraction(self, _, n, pred, k):
        """Repair the trace phrases left by a non-gap extraction under ``n``.

        The null element under ``n`` is removed first.  Every NP/PP/QP phrase
        that carries the trace index read from ``k.tag`` and dominates a
        ``*T*`` trace is then passed, with its parent, grandparent and
        sibling, to ``fix_object_gap``.  When ``relabel_relativiser(pred)``
        reports that no relativiser was found, the first ``[ICV]P`` node that
        has a sibling is wrapped in a unary "NN" node whose category is
        ``SS/SS`` (``SS`` being that sibling's category).

        Args:
            _: unused.
            n: root of the subtree being repaired.
            pred: node searched for an overt relativiser.
            k: node whose tag supplies the trace index.
        """
        node = n
        debug("Fixing nongap extraction: %s", pprint(node))
        debug("k %s", pprint(k))
        self.remove_null_element(node)

        index = get_trace_index_from_tag(k.tag)
        expr = (
            r'*=PP < { *=P < { /[NPQ]P(?:-%(tags)s)?%(index)s/=T << ^/\*T\*/ $ *=S } }'
            % {
                'tags': ModifierTagsRegex,
                'index': index
            })

        # we use "<<" in the expression, because fix_*_topicalisation comes
        # before fix_nongap_extraction, and this can introduce an extra layer between
        # the phrasal tag and the trace
        for _trace_np, ctx in find_all(node, expr, with_context=True):
            # remove T from P
            # replace P with S
            self.fix_object_gap(ctx.pp, ctx.p, ctx.t, ctx.s)

            if not self.relabel_relativiser(pred):
                result = get_first(node,
                                   r'/[ICV]P/=TOP $ *=SS',
                                   with_context=True)
                # get_first may find nothing; unpacking that result directly
                # would raise, so bail out the same way
                # fix_subject_extraction does.
                if not result:
                    debug(
                        'Could not find verbal category; did not create null relativiser.'
                    )
                    return

                top, context = result
                ss_category = context.ss.category
                null_rel_category = ss_category / ss_category

                debug("Creating null relativiser unary category: %s",
                      null_rel_category)
                replace_kid(
                    top.parent, top,
                    Node("NN", [top], null_rel_category, head_index=0))
Example #3
0
    def fix_long_bei_gap(self, node, bei, pred, top, n=None, reduced=False):
        """Repair the gap of a long *bei* construction found under ``top``.

        Unless ``reduced`` is set, the null element under ``top`` is removed
        first.  The first NP-TPC / NP-OBJ phrase matching the trace pattern is
        handed to ``fix_object_gap``; categories are then re-derived from its
        sibling up to ``top``, the bei category is relabelled, and ``top``
        takes the left part of its first child's category.

        Args:
            node: only used in the opening debug message.
            bei: not used by this method.
            pred: forwarded to ``relabel_bei_category``.
            top: subtree that is searched and relabelled.
            n: optional node whose tag supplies the trace index; when falsy,
                an escaped asterisk is used as the index instead.
            reduced: when true, skip removing the null element.
        """
        debug("Fixing long bei gap: %s", lrp_repr(node))

        if not reduced:
            self.remove_null_element(top)

        trace_index = get_trace_index_from_tag(n.tag) if n else r'\*'
        gap_pattern = (
            r'*=PP < { *=P < { /NP-(?:TPC|OBJ)/=T < ^/%s/a $ *=S } }' % trace_index)
        _matched, found = get_first(top, gap_pattern, with_context=True)

        grandparent, parent, trace, sibling = found.pp, found.p, found.t, found.s
        # remove T from P
        # replace P with S
        self.fix_object_gap(grandparent, parent, trace, sibling)

        self.fix_categories_starting_from(sibling, until=top)
        self.relabel_bei_category(top, pred)

        top.category = top[0].category.left

        debug("done %s", pprint(top))
Example #4
0
    def fix_long_bei_gap(self, node, bei, pred, top, n=None, reduced=False):
        """Repair the gap of a long *bei* construction found under ``top``.

        Unless ``reduced`` is set, the null element under ``top`` is removed
        first.  The first NP-TPC / NP-OBJ phrase matching the trace pattern is
        handed to ``fix_object_gap``; categories are then re-derived from its
        sibling up to ``top``, the bei category is relabelled, and ``top``
        takes the left part of its first child's category.

        Args:
            node: only used in the opening debug message.
            bei: not used by this method.
            pred: forwarded to ``relabel_bei_category``.
            top: subtree that is searched and relabelled.
            n: optional node whose tag supplies the trace index; when falsy,
                an escaped asterisk is used as the index instead.
            reduced: when true, skip removing the null element.
        """
        debug("Fixing long bei gap: %s", lrp_repr(node))

        if not reduced:
            self.remove_null_element(top)

        trace_index = get_trace_index_from_tag(n.tag) if n else r'\*'
        gap_pattern = (
            r'*=PP < { *=P < { /NP-(?:TPC|OBJ)/=T < ^/%s/a $ *=S } }' % trace_index)
        _matched, found = get_first(top, gap_pattern, with_context=True)

        grandparent, parent, trace, sibling = found.pp, found.p, found.t, found.s
        # remove T from P
        # replace P with S
        self.fix_object_gap(grandparent, parent, trace, sibling)

        self.fix_categories_starting_from(sibling, until=top)
        self.relabel_bei_category(top, pred)

        top.category = top[0].category.left

        debug("done %s", pprint(top))
Example #5
0
    def remove_null_element(self, node):
        """Shrink the tree around the first WHNP/WHPP found under ``node``.

        The pattern binds a grandparent PP, its child P, a WHNP/WHPP child T
        of P and a sibling S of T.  P is replaced by S inside PP, which drops
        the null element (WHNP and its -NONE- '*OP*' trace) from the tree.
        """
        null_pattern = r'*=PP < { *=P < { /WH[NP]P/=T $ *=S } }'
        grandparent, found = get_first(node, null_pattern, with_context=True)

        # S takes over the slot that P occupied under PP
        replace_kid(grandparent, found.p, found.s)
Example #6
0
    def fix_subject_extraction(self, _, n, pred, w=None, reduced=False):
        """Repair subject-extraction traces under ``n``.

        Args:
            _: unused.
            n: root of the subtree being repaired.
            pred: node whose tag decides the bare-N treatment and which is
                searched for an overt relativiser.
            w: optional node whose tag supplies the trace index; an empty
                index is used when it is falsy.
            reduced: when true, the null element is not removed.

        Each NP-SBJ phrase dominating the indexed ``*T*`` trace is passed to
        ``fix_object_gap`` and categories are re-derived from its sibling up
        to ``n``.  When no relativiser can be relabelled, a unary "NN" node of
        category ``SS/SS`` is wrapped around the verbal node found; if no such
        node exists the method returns early.
        """
        global use_bare_N

        debug("%s", reduced)
        node = n
        debug("Fixing subject extraction: %s", lrp_repr(node))

        # We only want this if we are using the N -> NP unary rule
        # This 'fix' lets us rewrite NP(WHNP CP) as NP(CP) with categories NP(N)
        if use_bare_N and pred.tag.startswith('NP'):
            # Fix for the NP(VP de) case:
            # ---------------------------
            #         NP                 NP
            #        /  \                |
            #      WHNP  CP     -->      CP
            #            / \            /  \
            #          IP  DEC         IP   DEC
            if not pred.is_leaf():
                pred.kids.pop(0)
                pred.head_index = 0
        elif not reduced:
            self.remove_null_element(node)

        trace_index = get_trace_index_from_tag(w.tag) if w else ''
        gap_pattern = r'*=PP < { *=P < { /NP-SBJ/=T << ^/\*T\*%s/ $ *=S } }' % trace_index

        # TOP is the shrunk VP
        # after shrinking, we can get VV or VA here
        verbal_pattern = r'{ /([ICV]P|V[VA]|VRD|VSB|VCD)/=TOP $ *=SS } ! > /([ICV]P|V[VA]|VRD|VSB|VCD)/'

        for _trace_np, ctx in find_all(node, gap_pattern, with_context=True):
            grandparent, parent, trace, sibling = ctx.pp, ctx.p, ctx.t, ctx.s

            self.fix_object_gap(grandparent, parent, trace, sibling)
            self.fix_categories_starting_from(sibling, until=node)

            if self.relabel_relativiser(pred):
                continue

            # left_to_right so that we find the right node (used to match against the CP 已建成的 in 4:45(7))
            found = get_first(node,
                              verbal_pattern,
                              with_context=True,
                              left_to_right=True)
            if not found:
                debug('Could not find verbal category; did not create null relativiser.')
                return

            top, context = found
            SS = context.ss.category

            debug("Creating null relativiser unary category: %s", SS / SS)
            wrapper = Node("NN", [top], SS / SS, head_index=0)
            replace_kid(top.parent, top, wrapper)
Example #7
0
    def fix_object_extraction(self, _, n, pred, w=None, reduced=False):
        """Repair object-extraction traces under ``n``.

        Args:
            _: unused on entry.
            n: root of the subtree being repaired.
            pred: node whose tag decides the bare-N treatment and which is
                searched for an overt relativiser.
            w: optional node whose tag supplies the trace index; an empty
                index is used when it is falsy.
            reduced: when true, the null element is not removed.

        Each NP-OBJ / NP-EXT phrase dominating the indexed ``*T*`` trace inside
        an IP/CP is passed to ``fix_object_gap`` and categories are re-derived
        from its sibling up to that IP/CP.  When no relativiser can be
        relabelled and the IP/CP has a sibling, a unary "NN" node of category
        ``SS/SS`` is wrapped around the IP/CP.
        """
        global use_bare_N

        node = n
        debug("Fixing object extraction: %s", lrp_repr(node))

        # We only want this if we are using the N -> NP unary rule
        # This 'fix' lets us rewrite NP(WHNP CP) as NP(CP) with categories NP(N)
        if use_bare_N and pred.tag.startswith('NP'):
            # Fix for the NP(VP de) case:
            # ---------------------------
            #         NP                 NP
            #        /  \                |
            #      WHNP  CP     -->      CP
            #            / \            /  \
            #          IP  DEC         IP   DEC
            if not pred.is_leaf():
                pred.kids.pop(0)
                pred.head_index = 0
        elif not reduced:
            self.remove_null_element(node)

        trace_index = get_trace_index_from_tag(w.tag) if w else ''
        gap_pattern = r'/[IC]P/=TOP << { *=PP < { *=P < { /NP-(OBJ|EXT)/=T << ^/\*T\*%s/ $ *=S } } }' % trace_index

        for _trace_np, ctx in find_all(node, gap_pattern, with_context=True):
            top = ctx.top
            grandparent, parent, trace, sibling = ctx.pp, ctx.p, ctx.t, ctx.s

            self.fix_object_gap(grandparent, parent, trace, sibling)
            self.fix_categories_starting_from(sibling, until=top)

            # A relabelled DEC node means there is nothing more to do;
            # otherwise this is the null relativiser case
            if self.relabel_relativiser(pred):
                continue

            # TOP is the S node
            # null relativiser category comes from sibling of TOP
            # if TOP has no sibling, then we're likely inside a NP-PRD < CP reduced relative (cf 1:2(9))
            found = get_first(top, r'* $ *=SS', with_context=True, nonrecursive=True)
            if not found:
                continue

            _matched, sibling_ctx = found
            ss = sibling_ctx.ss
            debug("Creating null relativiser unary category: %s",
                  ss.category / ss.category)
            wrapper = Node("NN", [top], ss.category / ss.category, head_index=0)
            replace_kid(top.parent, top, wrapper)
Example #8
0
    def relabel_bei_category(self, top, pred):
        """Rewrite the category of the bei marker found under ``top``.

        The marker's category is cloned with the sibling clause's category as
        its right-hand side, the right-hand side of its left part is set to
        ``pred.category``, and the marker's parent takes that left part.

        Returns:
            The bei marker node.
        """
        # particle 'you' is tagged as a preposition but acts as the BEI marker
        marker_pattern = r'*=S [ $ /LB/=BEI | $ ^"由"=BEI | $ ^"经"=BEI | $ ^"经过"=BEI | $ ^"随"=BEI | $ ^"为"=BEI | $ ^"以"=BEI | $ ^"经由"=BEI ]'
        _matched, found = get_first(top, marker_pattern, with_context=True)
        clause, marker = found.s, found.bei

        marker.category = marker.category.clone_with(right=clause.category)
        marker.category.left._right = pred.category

        marker.parent.category = marker.category.left

        debug("new bei category: %s", marker.category)
        return marker
Example #9
0
    def fix_subject_extraction(self, _, n, pred, w=None, reduced=False):
        """Repair subject-extraction traces under ``n``.

        Args:
            _: unused.
            n: root of the subtree being repaired.
            pred: node whose tag decides the bare-N treatment and which is
                searched for an overt relativiser.
            w: optional node whose tag supplies the trace index; an empty
                index is used when it is falsy.
            reduced: when true, the null element is not removed.

        Each NP-SBJ phrase dominating the indexed ``*T*`` trace is passed to
        ``fix_object_gap`` and categories are re-derived from its sibling up
        to ``n``.  When no relativiser can be relabelled, a unary "NN" node of
        category ``SS/SS`` is wrapped around the verbal node found; if no such
        node exists the method returns early.
        """
        global use_bare_N

        debug("%s", reduced)
        node = n
        debug("Fixing subject extraction: %s", lrp_repr(node))

        # We only want this if we are using the N -> NP unary rule
        # This 'fix' lets us rewrite NP(WHNP CP) as NP(CP) with categories NP(N)
        if use_bare_N and pred.tag.startswith('NP'):
            # Fix for the NP(VP de) case:
            # ---------------------------
            #         NP                 NP
            #        /  \                |
            #      WHNP  CP     -->      CP
            #            / \            /  \
            #          IP  DEC         IP   DEC
            if not pred.is_leaf():
                pred.kids.pop(0)
                pred.head_index = 0
        elif not reduced:
            self.remove_null_element(node)

        trace_index = get_trace_index_from_tag(w.tag) if w else ''
        gap_pattern = r'*=PP < { *=P < { /NP-SBJ/=T << ^/\*T\*%s/ $ *=S } }' % trace_index

        # TOP is the shrunk VP
        # after shrinking, we can get VV or VA here
        verbal_pattern = r'{ /([ICV]P|V[VA]|VRD|VSB|VCD)/=TOP $ *=SS } ! > /([ICV]P|V[VA]|VRD|VSB|VCD)/'

        for _trace_np, ctx in find_all(node, gap_pattern, with_context=True):
            grandparent, parent, trace, sibling = ctx.pp, ctx.p, ctx.t, ctx.s

            self.fix_object_gap(grandparent, parent, trace, sibling)
            self.fix_categories_starting_from(sibling, until=node)

            if self.relabel_relativiser(pred):
                continue

            # left_to_right so that we find the right node (used to match against the CP 已建成的 in 4:45(7))
            found = get_first(node, verbal_pattern, with_context=True, left_to_right=True)
            if not found:
                debug('Could not find verbal category; did not create null relativiser.')
                return

            top, context = found
            SS = context.ss.category

            debug("Creating null relativiser unary category: %s", SS/SS)
            wrapper = Node("NN", [top], SS/SS, head_index=0)
            replace_kid(top.parent, top, wrapper)
Example #10
0
    def relabel_bei_category(self, top, pred):
        """Rewrite the category of the bei marker found under ``top``.

        The marker's category is cloned with the sibling clause's category as
        its right-hand side, the right-hand side of its left part is set to
        ``pred.category``, and the marker's parent takes that left part.

        Returns:
            The bei marker node.
        """
        # particle 'you' is tagged as a preposition but acts as the BEI marker
        marker_pattern = r'*=S [ $ /LB/=BEI | $ ^"由"=BEI | $ ^"经"=BEI | $ ^"经过"=BEI | $ ^"随"=BEI | $ ^"为"=BEI | $ ^"以"=BEI | $ ^"经由"=BEI ]'
        _matched, found = get_first(top, marker_pattern, with_context=True)
        clause, marker = found.s, found.bei

        marker.category = marker.category.clone_with(right=clause.category)
        marker.category.left._right = pred.category

        marker.parent.category = marker.category.left

        debug("new bei category: %s", marker.category)
        return marker
Example #11
0
    def relabel_relativiser(self, node):
        """Relabel the relativiser category (NP/NP)\\S to (NP/NP)\\(S|NP).

        Looks under ``node`` for a DEC or SP node that has a sibling and
        replaces the right-hand side of its category with that sibling's
        category.

        Returns:
            True when a relativiser was found and relabelled; False (after a
            warning) when none was found.
        """
        found = get_first(node, r'*=S $ /(DEC|SP)/=REL', with_context=True, left_to_right=True)

        if found is None:
            warn("Couldn't find relativiser under %s", node)
            return False

        _matched, context = found
        sibling, rel = context.s, context.rel

        rel.category = rel.category.clone_with(right=sibling.category)
        debug("New rel category: %s", rel.category)

        return True
Example #12
0
    def accept_derivation(self, bundle):
        """Tally relative-clause traces in ``bundle.derivation``.

        For every SBAR that has a WHNP child and an NP sibling, the trace
        carrying the WHNP's index is looked up inside the SBAR.  Each trace
        found increments ``self.parents`` under the tag of the trace's parent
        (with a trailing ``-<digits>`` suffix removed).  ``self.rcderivs`` is
        incremented at most once per derivation, and ``self.total`` always.
        """
        seen_relative_clause = False
        derivation = bundle.derivation
        sbar_pattern = r'{ /SBAR/=SBAR < /WHNP/=WHNP } $ /NP/=NP'

        for _matched, ctx in tgrep(derivation, sbar_pattern, with_context=True):
            trace_pattern = r"^/\*T\*%s/" % extract_index(ctx.whnp)
            trace = get_first(ctx.sbar, trace_pattern)
            if trace is None:
                continue

            if not seen_relative_clause:
                self.rcderivs += 1
                seen_relative_clause = True

            # drop a trailing numeric index from the parent's tag
            parent_label = re.sub(r'-\d+$', '', trace.parent.tag)
            self.parents[parent_label] += 1

        self.total += 1
Example #13
0
    def fix_object_extraction(self, _, n, pred, w=None, reduced=False):
        """Repair object-extraction traces under ``n``.

        Args:
            _: unused on entry.
            n: root of the subtree being repaired.
            pred: node whose tag decides the bare-N treatment and which is
                searched for an overt relativiser.
            w: optional node whose tag supplies the trace index; an empty
                index is used when it is falsy.
            reduced: when true, the null element is not removed.

        Each NP-OBJ / NP-EXT phrase dominating the indexed ``*T*`` trace inside
        an IP/CP is passed to ``fix_object_gap`` and categories are re-derived
        from its sibling up to that IP/CP.  When no relativiser can be
        relabelled and the IP/CP has a sibling, a unary "NN" node of category
        ``SS/SS`` is wrapped around the IP/CP.
        """
        global use_bare_N

        node = n
        debug("Fixing object extraction: %s", lrp_repr(node))

        # We only want this if we are using the N -> NP unary rule
        # This 'fix' lets us rewrite NP(WHNP CP) as NP(CP) with categories NP(N)
        if use_bare_N and pred.tag.startswith('NP'):
            # Fix for the NP(VP de) case:
            # ---------------------------
            #         NP                 NP
            #        /  \                |
            #      WHNP  CP     -->      CP
            #            / \            /  \
            #          IP  DEC         IP   DEC
            if not pred.is_leaf():
                pred.kids.pop(0)
                pred.head_index = 0
        elif not reduced:
            self.remove_null_element(node)

        trace_index = get_trace_index_from_tag(w.tag) if w else ''
        gap_pattern = r'/[IC]P/=TOP << { *=PP < { *=P < { /NP-(OBJ|EXT)/=T << ^/\*T\*%s/ $ *=S } } }' % trace_index

        for _trace_np, ctx in find_all(node, gap_pattern, with_context=True):
            top = ctx.top
            grandparent, parent, trace, sibling = ctx.pp, ctx.p, ctx.t, ctx.s

            self.fix_object_gap(grandparent, parent, trace, sibling)
            self.fix_categories_starting_from(sibling, until=top)

            # A relabelled DEC node means there is nothing more to do;
            # otherwise this is the null relativiser case
            if self.relabel_relativiser(pred):
                continue

            # TOP is the S node
            # null relativiser category comes from sibling of TOP
            # if TOP has no sibling, then we're likely inside a NP-PRD < CP reduced relative (cf 1:2(9))
            found = get_first(top, r'* $ *=SS', with_context=True, nonrecursive=True)
            if not found:
                continue

            _matched, sibling_ctx = found
            ss = sibling_ctx.ss
            debug("Creating null relativiser unary category: %s", ss.category/ss.category)
            wrapper = Node("NN", [top], ss.category/ss.category, head_index=0)
            replace_kid(top.parent, top, wrapper)
Example #14
0
    def accept_derivation(self, bundle):
        """Tally relative-clause traces in ``bundle.derivation``.

        For every SBAR that has a WHNP child and an NP sibling, the trace
        carrying the WHNP's index is looked up inside the SBAR.  Each trace
        found increments ``self.parents`` under the tag of the trace's parent
        (with a trailing ``-<digits>`` suffix removed).  ``self.rcderivs`` is
        incremented at most once per derivation, and ``self.total`` always.
        """
        seen_relative_clause = False
        derivation = bundle.derivation
        sbar_pattern = r'{ /SBAR/=SBAR < /WHNP/=WHNP } $ /NP/=NP'

        for _matched, ctx in tgrep(derivation, sbar_pattern, with_context=True):
            trace_pattern = r"^/\*T\*%s/" % extract_index(ctx.whnp)
            trace = get_first(ctx.sbar, trace_pattern)
            if trace is None:
                continue

            if not seen_relative_clause:
                self.rcderivs += 1
                seen_relative_clause = True

            # drop a trailing numeric index from the parent's tag
            parent_label = re.sub(r'-\d+$', '', trace.parent.tag)
            self.parents[parent_label] += 1

        self.total += 1
Example #15
0
    def relabel_relativiser(self, node):
        """Relabel the relativiser category (NP/NP)\\S to (NP/NP)\\(S|NP).

        Looks under ``node`` for a DEC or SP node that has a sibling and
        replaces the right-hand side of its category with that sibling's
        category.

        Returns:
            True when a relativiser was found and relabelled; False (after a
            warning) when none was found.
        """
        found = get_first(node,
                          r'*=S $ /(DEC|SP)/=REL',
                          with_context=True,
                          left_to_right=True)

        if found is None:
            warn("Couldn't find relativiser under %s", node)
            return False

        _matched, context = found
        sibling, rel = context.s, context.rel

        rel.category = rel.category.clone_with(right=sibling.category)
        debug("New rel category: %s", rel.category)

        return True
Example #16
0
 def accept_derivation(self, bundle):
     """Count which named patterns occur in ``bundle.derivation``.

     ``self.freq['all']`` is incremented for every derivation, and
     ``self.freq[name]`` for each entry of ``name_to_pattern_map`` whose
     pattern matches somewhere in the derivation.
     """
     derivation = bundle.derivation
     self.freq['all'] += 1
     for label, tree_pattern in name_to_pattern_map.items():
         if get_first(derivation, tree_pattern):
             self.freq[label] += 1
Example #17
0
def preprocess(root):
    """Apply treebank clean-up rewrites to the tree under ``root``, in place.

    The root is retagged first when it consists of a PP followed by PU.  Then
    every non-leaf node yielded by ``nodes(root)`` is visited: the optional
    LCP -> NP retagging and the paired-quote reshaping run unconditionally,
    after which exactly one branch of the long ``if``/``elif``/``else`` chain
    is taken.  The final ``else`` applies several independent pattern-based
    fixes one after another.

    References such as ``20:58(1)`` in the comments presumably identify the
    corpus derivation that motivated each fix -- TODO confirm the notation.

    Args:
        root: root node of the tree; it and its descendants are mutated.

    Returns:
        The same ``root`` object.
    """
    # IP < PP PU -> PP < PP PU (20:58(1))
    if root.count() == 2 and root[1].tag == 'PU' and root[0].tag.startswith('PP'): root.tag = root[0].tag

    for node in nodes(root):
        if node.is_leaf(): continue

        # rewrite_lcp_as_np is read from module scope (not defined in this function)
        if rewrite_lcp_as_np and node.tag.startswith('LCP'):
            node.tag = node.tag.replace('LCP', 'NP')

        # Each call yields a (kid, index) pair; the kid may be falsy, which is
        # why the branches below test first_kid / last_kid before using them.
        first_kid, first_kid_index = get_nonpunct_kid(node, get_last=False)
        last_kid,  last_kid_index  = get_nonpunct_kid(node, get_last=True)
        # ---------------------
        # Where LPU, RPU are paired punctuation, reshape YP(LPU ... XP RPU YP) into YP(XP(LPU ... XP) YP)
        if any(kid.lex in ("“", "「") for kid in leaf_kids(node)) and any(kid.lex in ("”", "」") for kid in leaf_kids(node)):
            lqu = first_index_such_that(lambda kid: kid.is_leaf() and kid.lex in ("“", "「"), node)
            rqu = first_index_such_that(lambda kid: kid.is_leaf() and kid.lex in ("”", "」"), node)
            # Only reshape when the closing quote is not already the last kid
            if rqu != node.count()-1:
                quoted_kids = node.kids[lqu:rqu+1]
                del node.kids[lqu:rqu+1]

                last_nonpunct_kid, _ = get_nonpunct_element(quoted_kids, get_last=True)
                # Bad punctuation in 27:84(4) causes a mis-analysis, just ignore
                # NOTE(review): when last_nonpunct_kid is falsy the quoted kids
                # removed above are not re-inserted.
                if last_nonpunct_kid:
                    quoted_node = Node(last_nonpunct_kid.tag, quoted_kids)
                    node.kids.insert(lqu, quoted_node)

        # CPTB/Chinese-specific fixes
        # ---------------------------
        # PP(P CP NP) in derivations like 5:11(3) should be PP(P NP(CP NP))
        if first_kid and first_kid.tag == "P" and node.count() > 2:
            last_tag = last_kid.tag
            rest = node.kids[1:]
            del node.kids[1:]
            node.kids.append(Node(last_tag, rest, node))
        # 2:12(3). DNP-PRD fixed by adding a layer of NP
        elif (node.tag.startswith('VP') and node.count() == 2 and
                node[0].tag.startswith('VC') and
                node[1].tag.startswith('DNP-PRD')): node[1] = Node('NP', [node[1]], node)
        # fix missing -OBJ tag from VP object complements (c.f. 31:18(4))
        # NOTE(review): node.tag.startswith('VP') is tested twice in this condition.
        elif (node.tag.startswith('VP') and node.count() >= 2 and
              node.tag.startswith('VP') and
              node[0].tag == 'VV' and
              node[-1].tag == 'NP'): node[-1].tag += "-OBJ"
        # fix bad annotation IP < IP (2:7(28)), VP < VP (0:1(5))
        elif any(is_repeated_unary_projection(xp, node) for xp in ('IP', 'VP', 'NP', 'CP')):
            node.kids = node[0].kids
        # treat DP-SBJ as QP-SBJ (6:37(9)): the rationale is that the determiner (e.g. 每) acts as a specifier,
        # just like a quantity
        elif node.tag == 'DP-SBJ':
            node.tag = 'QP-SBJ'
        # attach the PU preceding a PRN under the PRN
        elif last_kid and last_kid.tag == 'PRN' and last_kid.count() == 1:
            maybe_pu = node[last_kid_index-1]
            if maybe_pu.tag == 'PU':
                del node.kids[last_kid_index-1]
                last_kid.kids.insert(0, maybe_pu) # prepend
        # DEG instead of DEC (29:34(3)). if there's a trace in DEG's sibling and no DEC, then change DEG to DEC.
        elif node.tag == 'CP' and node.count() == 2 and node[0].tag == 'IP' and node[1].tag == 'DEG':
            if get_first(node[0], r'^/\*T\*/') and not get_first(node[0], r'/DEC/'):
                node[1].tag = 'DEC'

        # Retag QP-APP kids of an NP as NP-APP
        elif node.tag.startswith('NP') and any(kid.tag.startswith('QP-APP') for kid in node):
            for kid in node:
                if kid.tag.startswith('QP-APP'): kid.tag = kid.tag.replace('QP', 'NP')

        # NP(CP NP-APP NP-PN) -> NP(CP NP(NP-APP NP-PN)) so that NP(NP-APP NP-PN) can receive NP internal structure-type analysis
        elif node.tag.startswith('NP') and node.count() == 3 and node[0].tag.startswith('CP') and node[1].tag.startswith('NP-APP') and node[2].tag.startswith('NP-PN'):
            np_app, np_pn = node[1], node[2]
            del node.kids[1:]

            node.kids.append(Node(node.tag, [np_app, np_pn], node))

        # IP < NP-SBJ ADVP VP rather than IP < NP-SBJ VP(ADVP VP) (25:59(12), 6:92(19))
        elif node.tag == 'IP' and node.count() == 3 and node[0].tag == 'NP-SBJ' and node[1].tag == 'ADVP' and node[2].tag == 'VP':
            advp = node.kids.pop(1)
            # VP is the new node[1]
            # now replace node[1] with Node(node[1])
            node[1] = Node(node[1].tag, [advp, node[1]], node)

        # fixing DNP(PN DEG), which causes mis-tagging DNP(PN:l DEG:h)
        # only 3 cases: 23:61(5), 9:14(14), 21:3(11)
        elif node.tag == 'DNP' and node.count() == 2 and node[0].tag == 'PN' and node[1].tag == 'DEG':
            replace_kid(node, node[0], Node('NP', [node[0]]))

        elif is_vnv(node) and node.count() == 3:
            # Re-analyse VNV as coordination
            node[1].tag = 'CC'

        # Unary projections: the nested chain below handles one kid only.
        # fix mistaggings of the form ADVP < JJ (1:7(9)), NP < JJ (5:35(1))
        elif node.count() == 1:
            # fix IP < VP by adding *pro*
            if node.tag.startswith('IP') and node[0].tag.startswith('VP'):
                leaf = Leaf('-NONE-', '*pro*', None)
                pro = Node('NP-SBJ', [leaf])

                node.kids.insert(0, pro)
            elif node[0].tag == 'JJ':
                if node.tag.startswith('ADVP'):
                    node.tag = node.tag.replace('ADVP', 'ADJP')
                elif node.tag.startswith('NP'):
                    node.tag = node.tag.replace('NP', 'ADJP')

            # fix NP < VV
            elif node.tag == 'NP' and node[0].tag == 'VV':
                node.tag = node.tag.replace('NP', 'VP')

            # fix NP < ADJP < JJ (5:35(1))
            elif node.tag == 'NP' and node[0].tag == 'ADJP':
                replace_kid(node.parent, node, node[0])

            # fix projections NP < QP
            elif node[0].tag.startswith('QP') and node.tag.startswith('NP'):
                inherit_tag(node[0], node) # copy PCTB tags from NP to QP
                node.tag = node[0].tag # copy QP to parent, replacing NP
                node.kids = node[0].kids
            # CP-APP < IP: same collapse as NP < QP above
            elif node[0].tag == 'IP' and node.tag == 'CP-APP':
                inherit_tag(node[0], node)
                node.tag = node[0].tag
                node.kids = node[0].kids
            # CLP < NN
            elif node[0].tag == 'NN' and node.tag == 'CLP':
                node[0].tag = 'M'
            # VP < NN: retag the noun as VV
            elif node[0].tag == 'NN' and node.tag.startswith("VP"):
                node[0].tag = 'VV'
            elif node[0].tag == 'CP':
                if node.tag == 'NP-PRD':
                    node.kids = node[0].kids
                else:
                    # Rewrite NP < { CP < { CP < DEC } } 
                    # (i.e. 比 报告 的 早 一点) so that it's headed by the 的
                    expr = r'''/CP/ < { /CP/ < /DEC/ }'''
                    if get_first(node[0], expr):
                        node.kids = node[0].kids
                        
            # PRN over a single NP / NP-PN / VP / IP: splice the kid's children into PRN
            elif node[0].tag in ('NP', 'NP-PN', 'VP', 'IP') and node.tag == 'PRN':
                node.kids = node[0].kids
                
            # ADVP < CS: shrink so that CS will be considered the head by binarise
            # CP < M: tagging error 7:14(8), 10:51(4), 11:13(32), 11:15(47)
            elif ((node.tag == 'ADVP' and node[0].tag == 'CS') or  
                  (node[0].tag == 'M' and node.tag == 'CP')):
                replace_kid(node.parent, node, node[0])
                
            # fix NP<DNP so that it's headed by the DEC 8:38(18), 0:30(4)
            elif node.tag.startswith('NP') and node[0].tag.startswith('DNP'):
                node.kids = node[0].kids

            # elif node.tag == 'VP' and node[0].tag == 'NP-PRD':
            #     replace_kid(node.parent, node, node[0])
            
            # couple of noisy derivs like 10:35(80), 10:26(121), 11:37(3)
            # elif node.tag == 'VP' and node[0].tag.startswith('IP'):
            #     replace_kid(node.parent, node, node[0])
                
        # Reshape LB (long bei)
        # ---------------------
        elif first_kid and first_kid.tag == "LB":
            expr = r'''* < { /LB/=LB
                       [ $ { * < /-(SBJ|OBJ|PN)/a=SBJ < /(V[PV]|VRD|VSB)/=PRED }=IP
                       | $ { /CP/=CP < { *=IP < /-(SBJ|OBJ|PN)/a=SBJ < /(V[PV]|VRD|VSB)/=PRED } } ] }'''
            # NOTE(review): the result is unpacked without checking that a
            # match was found, unlike the BA branch below.
            top, ctx = get_first(node, expr, with_context=True)

            # cp and ip are read here but not used afterwards
            lb, sbj, pred, cp, ip = ctx.lb, ctx.sbj, ctx.pred, ctx.cp, ctx.ip
            top.kids = [lb, Node('IP', [sbj, pred])]
            # top.kids = [lb, sbj, pred]
            
        # elif False:
        # Reshape BA: flatten to (BA, subject, predicate)
        elif first_kid and first_kid.tag == "BA":
            expr = r'''* < { /BA/=LB $ { /IP/ < /NP/=SBJ < /VP/=PRED } }'''
                
            result = get_first(node, expr, with_context=True)
            if result:
                top, ctx = result

                # NOTE(review): the pattern above binds no =IP name, and ip is
                # not used afterwards.
                lb, sbj, pred, ip = ctx.lb, ctx.sbj, ctx.pred, ctx.ip
    #            top.kids = [lb, Node('IP', [sbj, pred])]
                top.kids = [lb, sbj, pred]

        # single mistagging CP-SBJ for CP in 24:58(1)
        elif node.tag == 'CP-SBJ': node.tag = 'CP'
        
        else:
            # None of the shape-specific branches applied: run each of the
            # pattern-based fixes below in turn.

            # Fix missing phrasal layer in NP < NN DEG (21:10(4))
            result = get_first(node, r'/DNP/=P < { /N[NRT]/=N $ /DEG/ }', with_context=True)
            if result:
                p, ctx = result
                n = ctx.n
                replace_kid(p, n, Node('NP', [n]))

            # Fix missing phrasal layer in LCP < NN LC (11:17(9))
            result = get_first(node, r'/LCP/=P < { /N[NRT]/=N $ /LC/ }', with_context=True)
            if result:
                p, ctx = result
                n = ctx.n
                replace_kid(p, n, Node('NP', [n]))

            # Fix wrongly attached DEC (5:26(6))
            result = get_first(node, r'/CP/=TOP < { /IP/=P < { /NP/ $ /VP/ $ /DEC/=DEC } }', with_context=True)
            if result:
                _, ctx = result
                top, p, dec = ctx.top, ctx.p, ctx.dec

                # Move DEC from the IP up to the CP
                top.kids.append(dec)
                p.kids.remove(dec)

            # IP-TPC consisting of a *PRO* NP and a VP: replace the IP-TPC by its VP
            result = get_first(node, r'*=PP < { /IP-TPC/=P <1 { /NP/=T < ^/\*PRO\*/ } <2 /VP/=S }', nonrecursive=True, with_context=True)
            if result:
                _, ctx = result
                pp, p, s = ctx.pp, ctx.p, ctx.s
                inherit_tag(s, p)
                replace_kid(pp, p, s)

            # VP(VV IP-OBJ(NP-SBJ VP)): flatten the IP-OBJ into the VP,
            # dropping the subject when it contains *PRO*
            expr = r'''/VP/=VP <1 /VV/=V <2 { /IP-OBJ/ <1 /NP-SBJ/=SBJ <2 /VP/=PRED }'''
            result = get_first(node, expr, with_context=True)
            if result:
                _, ctx = result
                vp, v, sbj, pred = ctx.vp, ctx.v, ctx.sbj, ctx.pred

                # NOTE(review): the kids attribute is deleted and then
                # reassigned in both branches below.
                del vp.kids
                if get_first(sbj, r'* < ^/\*PRO\*/'):
                    vp.kids = [v, pred]
                else:
                    vp.kids = [v, sbj, pred]

            # QP(CD CC CD ...): group the leading CD CC CD under a new QP
            expr = r'''/QP/=P <1 /CD/ <2 /CC/ <3 /CD/'''
            result = get_first(node, expr, with_context=True)
            if result:
                _, ctx = result
                p = ctx.p

                # Nothing follows the triple, so no extra layer is needed;
                # continue moves on to the next node of the outer loop.
                if p.count() <= 3: continue

                # rest is computed but not used afterwards
                cd_cc_cd, rest = p.kids[0:3], p.kids[3:]
                del p.kids[0:3]

                new_node = Node('QP', cd_cc_cd)
                p.kids.insert(0, new_node)

    return root
Example #18
0
def preprocess(root):
    """Apply a sequence of tag and structure repairs to a derivation, in place.

    Every internal node yielded by ``nodes(root)`` is visited once. For each
    node, the paired-punctuation reshaping is attempted first, and then at
    most one branch of the long ``if``/``elif`` chain below is applied (the
    first whose condition holds). The trailing ``else`` branch runs a set of
    independent pattern-based repairs.

    Args:
        root: the root node of the derivation; it is mutated.

    Returns:
        The same ``root`` object that was passed in.
    """
    left_quotes = ("“", "「")
    right_quotes = ("”", "」")

    # IP < PP PU -> PP < PP PU (20:58(1))
    if (root.count() == 2 and root[1].tag == 'PU'
            and root[0].tag.startswith('PP')):
        root.tag = root[0].tag

    for node in nodes(root):
        if node.is_leaf():
            continue

        if rewrite_lcp_as_np and node.tag.startswith('LCP'):
            node.tag = node.tag.replace('LCP', 'NP')

        # NOTE(review): these are computed before the paired-punctuation
        # reshaping below, which may move kids; confirm that a stale
        # last_kid_index cannot matter for the PRN branch.
        first_kid, _ = get_nonpunct_kid(node, get_last=False)
        last_kid, last_kid_index = get_nonpunct_kid(node, get_last=True)

        # ---------------------
        # Where LPU, RPU are paired punctuation, reshape
        # YP(LPU ... XP RPU YP) into YP(XP(LPU ... XP) YP)
        has_left_quote = any(
            kid.lex in left_quotes for kid in leaf_kids(node))
        if has_left_quote and any(
                kid.lex in right_quotes for kid in leaf_kids(node)):
            lqu = first_index_such_that(
                lambda kid: kid.is_leaf() and kid.lex in left_quotes, node)
            rqu = first_index_such_that(
                lambda kid: kid.is_leaf() and kid.lex in right_quotes, node)
            # Nothing to reshape when the closing quote is already the last kid
            if rqu != node.count() - 1:
                quoted_kids = node.kids[lqu:rqu + 1]
                del node.kids[lqu:rqu + 1]

                last_nonpunct_kid, _ = get_nonpunct_element(quoted_kids,
                                                            get_last=True)
                # Bad punctuation in 27:84(4) causes a mis-analysis, just ignore
                if last_nonpunct_kid:
                    quoted_node = Node(last_nonpunct_kid.tag, quoted_kids)
                    node.kids.insert(lqu, quoted_node)

        # CPTB/Chinese-specific fixes
        # ---------------------------
        # PP(P CP NP) in derivations like 5:11(3) should be PP(P NP(CP NP))
        if first_kid and first_kid.tag == "P" and node.count() > 2:
            last_tag = last_kid.tag
            rest = node.kids[1:]
            del node.kids[1:]
            node.kids.append(Node(last_tag, rest, node))

        # 2:12(3). DNP-PRD fixed by adding a layer of NP
        elif (node.tag.startswith('VP') and node.count() == 2
              and node[0].tag.startswith('VC')
              and node[1].tag.startswith('DNP-PRD')):
            node[1] = Node('NP', [node[1]], node)

        # fix missing -OBJ tag from VP object complements (c.f. 31:18(4))
        elif (node.tag.startswith('VP') and node.count() >= 2
              and node[0].tag == 'VV' and node[-1].tag == 'NP'):
            node[-1].tag += "-OBJ"

        # fix bad annotation IP < IP (2:7(28)), VP < VP (0:1(5))
        elif any(
                is_repeated_unary_projection(xp, node)
                for xp in ('IP', 'VP', 'NP', 'CP')):
            node.kids = node[0].kids

        # treat DP-SBJ as QP-SBJ (6:37(9)): the rationale is that the
        # determiner (e.g. 每) acts as a specifier, just like a quantity
        elif node.tag == 'DP-SBJ':
            node.tag = 'QP-SBJ'

        # attach the PU preceding a PRN under the PRN
        elif last_kid and last_kid.tag == 'PRN' and last_kid.count() == 1:
            # Only look to the left when the PRN is not the first kid:
            # index -1 would wrap around to the node's *last* kid and could
            # pull a trailing PU into the PRN.
            if last_kid_index > 0:
                maybe_pu = node[last_kid_index - 1]
                if maybe_pu.tag == 'PU':
                    del node.kids[last_kid_index - 1]
                    last_kid.kids.insert(0, maybe_pu)  # prepend

        # DEG instead of DEC (29:34(3)). if there's a trace in DEG's sibling
        # and no DEC, then change DEG to DEC.
        elif (node.tag == 'CP' and node.count() == 2
              and node[0].tag == 'IP' and node[1].tag == 'DEG'):
            if (get_first(node[0], r'^/\*T\*/')
                    and not get_first(node[0], r'/DEC/')):
                node[1].tag = 'DEC'

        # NP < QP-APP: relabel the QP-APP kids as NP-APP
        elif node.tag.startswith('NP') and any(
                kid.tag.startswith('QP-APP') for kid in node):
            for kid in node:
                if kid.tag.startswith('QP-APP'):
                    kid.tag = kid.tag.replace('QP', 'NP')

        # NP(CP NP-APP NP-PN) -> NP(CP NP(NP-APP NP-PN)) so that
        # NP(NP-APP NP-PN) can receive NP internal structure-type analysis
        elif (node.tag.startswith('NP') and node.count() == 3
              and node[0].tag.startswith('CP')
              and node[1].tag.startswith('NP-APP')
              and node[2].tag.startswith('NP-PN')):
            np_app, np_pn = node[1], node[2]
            del node.kids[1:]

            node.kids.append(Node(node.tag, [np_app, np_pn], node))

        # IP < NP-SBJ ADVP VP rather than IP < NP-SBJ VP(ADVP VP)
        # (25:59(12), 6:92(19))
        elif (node.tag == 'IP' and node.count() == 3
              and node[0].tag == 'NP-SBJ' and node[1].tag == 'ADVP'
              and node[2].tag == 'VP'):
            advp = node.kids.pop(1)
            # VP is the new node[1]
            # now replace node[1] with Node(node[1])
            node[1] = Node(node[1].tag, [advp, node[1]], node)

        # fixing DNP(PN DEG), which causes mis-tagging DNP(PN:l DEG:h)
        # only 3 cases: 23:61(5), 9:14(14), 21:3(11)
        elif (node.tag == 'DNP' and node.count() == 2
              and node[0].tag == 'PN' and node[1].tag == 'DEG'):
            replace_kid(node, node[0], Node('NP', [node[0]]))

        elif is_vnv(node) and node.count() == 3:
            # Re-analyse VNV as coordination
            node[1].tag = 'CC'

        # fix mistaggings of the form ADVP < JJ (1:7(9)), NP < JJ (5:35(1)),
        # and other unary projections
        elif node.count() == 1:
            # fix IP < VP by adding *pro*
            if node.tag.startswith('IP') and node[0].tag.startswith('VP'):
                leaf = Leaf('-NONE-', '*pro*', None)
                pro = Node('NP-SBJ', [leaf])

                node.kids.insert(0, pro)

            elif node[0].tag == 'JJ':
                if node.tag.startswith('ADVP'):
                    node.tag = node.tag.replace('ADVP', 'ADJP')
                elif node.tag.startswith('NP'):
                    node.tag = node.tag.replace('NP', 'ADJP')

            # fix NP < VV
            elif node.tag == 'NP' and node[0].tag == 'VV':
                node.tag = node.tag.replace('NP', 'VP')

            # fix NP < ADJP < JJ (5:35(1))
            elif node.tag == 'NP' and node[0].tag == 'ADJP':
                replace_kid(node.parent, node, node[0])

            # fix projections NP < QP
            elif node[0].tag.startswith('QP') and node.tag.startswith('NP'):
                inherit_tag(node[0], node)  # copy PCTB tags from NP to QP
                node.tag = node[0].tag  # copy QP to parent, replacing NP
                node.kids = node[0].kids

            # CP-APP < IP: same treatment as NP < QP above
            elif node[0].tag == 'IP' and node.tag == 'CP-APP':
                inherit_tag(node[0], node)
                node.tag = node[0].tag
                node.kids = node[0].kids

            # CLP < NN
            elif node[0].tag == 'NN' and node.tag == 'CLP':
                node[0].tag = 'M'

            # VP < NN
            elif node[0].tag == 'NN' and node.tag.startswith("VP"):
                node[0].tag = 'VV'

            elif node[0].tag == 'CP':
                if node.tag == 'NP-PRD':
                    node.kids = node[0].kids
                else:
                    # Rewrite NP < { CP < { CP < DEC } }
                    # (i.e. 比 报告 的 早 一点) so that it's headed by the 的
                    expr = r'''/CP/ < { /CP/ < /DEC/ }'''
                    if get_first(node[0], expr):
                        node.kids = node[0].kids

            elif (node[0].tag in ('NP', 'NP-PN', 'VP', 'IP')
                  and node.tag == 'PRN'):
                node.kids = node[0].kids

            # ADVP < CS: shrink so that CS will be considered the head by binarise
            # CP < M: tagging error 7:14(8), 10:51(4), 11:13(32), 11:15(47)
            elif ((node.tag == 'ADVP' and node[0].tag == 'CS')
                  or (node[0].tag == 'M' and node.tag == 'CP')):
                replace_kid(node.parent, node, node[0])

            # fix NP<DNP so that it's headed by the DEC 8:38(18), 0:30(4)
            elif node.tag.startswith('NP') and node[0].tag.startswith('DNP'):
                node.kids = node[0].kids

            # Disabled:
            # elif node.tag == 'VP' and node[0].tag == 'NP-PRD':
            #     replace_kid(node.parent, node, node[0])

            # Disabled: couple of noisy derivs like 10:35(80), 10:26(121), 11:37(3)
            # elif node.tag == 'VP' and node[0].tag.startswith('IP'):
            #     replace_kid(node.parent, node, node[0])

        # Reshape LB (long bei)
        # ---------------------
        elif first_kid and first_kid.tag == "LB":
            expr = r'''* < { /LB/=LB
                       [ $ { * < /-(SBJ|OBJ|PN)/a=SBJ < /(V[PV]|VRD|VSB)/=PRED }=IP
                       | $ { /CP/=CP < { *=IP < /-(SBJ|OBJ|PN)/a=SBJ < /(V[PV]|VRD|VSB)/=PRED } } ] }'''
            # Guarded like the BA branch below: an LB node which does not fit
            # the pattern is left untouched instead of failing on the unpack.
            result = get_first(node, expr, with_context=True)
            if result:
                top, ctx = result
                top.kids = [ctx.lb, Node('IP', [ctx.sbj, ctx.pred])]

        # Reshape BA: unlike LB, the subject and predicate are attached flat
        elif first_kid and first_kid.tag == "BA":
            expr = r'''* < { /BA/=LB $ { /IP/ < /NP/=SBJ < /VP/=PRED } }'''

            result = get_first(node, expr, with_context=True)
            if result:
                top, ctx = result
                top.kids = [ctx.lb, ctx.sbj, ctx.pred]

        # single mistagging CP-SBJ for CP in 24:58(1)
        elif node.tag == 'CP-SBJ':
            node.tag = 'CP'

        else:
            # Fix missing phrasal layer in NP < NN DEG (21:10(4))
            result = get_first(node,
                               r'/DNP/=P < { /N[NRT]/=N $ /DEG/ }',
                               with_context=True)
            if result:
                p, ctx = result
                n = ctx.n
                replace_kid(p, n, Node('NP', [n]))

            # Fix missing phrasal layer in LCP < NN LC (11:17(9))
            result = get_first(node,
                               r'/LCP/=P < { /N[NRT]/=N $ /LC/ }',
                               with_context=True)
            if result:
                p, ctx = result
                n = ctx.n
                replace_kid(p, n, Node('NP', [n]))

            # Fix wrongly attached DEC (5:26(6))
            result = get_first(
                node,
                r'/CP/=TOP < { /IP/=P < { /NP/ $ /VP/ $ /DEC/=DEC } }',
                with_context=True)
            if result:
                _, ctx = result
                top, p, dec = ctx.top, ctx.p, ctx.dec

                # Move DEC from under the IP to directly under the CP
                top.kids.append(dec)
                p.kids.remove(dec)

            # IP-TPC(NP(*PRO*) VP): replace the IP-TPC with its VP, which
            # inherits the tag of the IP-TPC
            result = get_first(
                node,
                r'*=PP < { /IP-TPC/=P <1 { /NP/=T < ^/\*PRO\*/ } <2 /VP/=S }',
                nonrecursive=True,
                with_context=True)
            if result:
                _, ctx = result
                pp, p, s = ctx.pp, ctx.p, ctx.s
                inherit_tag(s, p)
                replace_kid(pp, p, s)

            # VP(VV IP-OBJ(NP-SBJ VP)): flatten the IP-OBJ into the outer VP,
            # dropping the subject when it dominates *PRO*
            expr = r'''/VP/=VP <1 /VV/=V <2 { /IP-OBJ/ <1 /NP-SBJ/=SBJ <2 /VP/=PRED }'''
            result = get_first(node, expr, with_context=True)
            if result:
                _, ctx = result
                vp, v, sbj, pred = ctx.vp, ctx.v, ctx.sbj, ctx.pred

                if get_first(sbj, r'* < ^/\*PRO\*/'):
                    vp.kids = [v, pred]
                else:
                    vp.kids = [v, sbj, pred]

            # QP(CD CC CD ...): group the leading CD CC CD under its own QP
            expr = r'''/QP/=P <1 /CD/ <2 /CC/ <3 /CD/'''
            result = get_first(node, expr, with_context=True)
            if result:
                _, ctx = result
                p = ctx.p

                # With exactly three kids there is nothing left to bracket off
                if p.count() > 3:
                    cd_cc_cd = p.kids[0:3]
                    del p.kids[0:3]

                    p.kids.insert(0, Node('QP', cd_cc_cd))

    return root
Example #19
0
 def accept_derivation(self, bundle):
     """Tally which named patterns occur in the derivation of ``bundle``.

     ``self.freq['all']`` is bumped once per call; ``self.freq[name]`` is
     bumped for every entry of ``name_to_pattern_map`` whose pattern is
     found somewhere in the derivation.
     """
     derivation = bundle.derivation
     self.freq['all'] += 1
     for label, tree_pattern in name_to_pattern_map.items():
         if not get_first(derivation, tree_pattern):
             continue
         self.freq[label] += 1
Example #20
0
    def remove_null_element(self, node):
        """Splice out the null WHNP/WHPP element found under ``node``.

        The first parent P that has a WHNP/WHPP kid with a sibling S is
        replaced, in its own parent PP, by that sibling S. This discards
        the null element (and its -NONE- '*OP*' trace) and shrinks the tree
        by one level.
        """
        null_element_pattern = r'*=PP < { *=P < { /WH[NP]P/=T $ *=S } }'
        grandparent, bound = get_first(
            node, null_element_pattern, with_context=True)

        # P is the node to discard, S the sibling that takes its place
        discarded, survivor = bound.p, bound.s
        replace_kid(grandparent, discarded, survivor)