def fix_nongap_extraction(self, _, n, pred, k):
    """Repair a non-gap extraction construction rooted at *n*.

    Removes the null element, then for every trace-bearing NP/PP/QP found
    under the node, shrinks the gap and (if no overt relativiser can be
    relabelled) inserts a null-relativiser unary node.
    NOTE(review): operates on PCTB-style parse trees — confirm against callers.
    """
    node = n
    debug("Fixing nongap extraction: %s", pprint(node))
    debug("k %s", pprint(k))

    self.remove_null_element(node)

    index = get_trace_index_from_tag(k.tag)
    expr = (r'*=PP < { *=P < { /[NPQ]P(?:-%(tags)s)?%(index)s/=T << ^/\*T\*/ $ *=S } }'
            % {'tags': ModifierTagsRegex, 'index': index})
    # we use "<<" in the expression, because fix_*_topicalisation comes
    # before fix_nongap_extraction, and this can introduce an extra layer
    # between the phrasal tag and the trace
    for trace_NP, ctx in find_all(node, expr, with_context=True):
        pp, p, t, s = ctx.pp, ctx.p, ctx.t, ctx.s
        # remove T from P, then replace P with S
        self.fix_object_gap(pp, p, t, s)

        if not self.relabel_relativiser(pred):
            # No overt relativiser: synthesise a null one from the sibling's category
            top, context = get_first(node, r'/[ICV]P/=TOP $ *=SS', with_context=True)
            ss = context.ss
            debug("Creating null relativiser unary category: %s", ss.category / ss.category)
            replace_kid(top.parent, top,
                        Node("NN", [top], ss.category / ss.category, head_index=0))
def fix_nongap_extraction(self, _, n, pred, k):
    """Repair a non-gap extraction construction rooted at *n*.

    Shrinks each trace-bearing phrase found under the node and, when the
    relativiser cannot be relabelled, inserts a null-relativiser unary node.
    NOTE(review): duplicate of the sibling definition above — presumably one
    copy should be removed; verify before deleting.
    """
    node = n
    debug("Fixing nongap extraction: %s", pprint(node))
    debug("k %s", pprint(k))

    self.remove_null_element(node)

    index = get_trace_index_from_tag(k.tag)
    expr = (r'*=PP < { *=P < { /[NPQ]P(?:-%(tags)s)?%(index)s/=T << ^/\*T\*/ $ *=S } }'
            % {'tags': ModifierTagsRegex, 'index': index})
    # we use "<<" in the expression, because fix_*_topicalisation comes
    # before fix_nongap_extraction, and this can introduce an extra layer
    # between the phrasal tag and the trace
    for trace_NP, ctx in find_all(node, expr, with_context=True):
        pp, p, t, s = ctx.pp, ctx.p, ctx.t, ctx.s
        # remove T from P, then replace P with S
        self.fix_object_gap(pp, p, t, s)

        if not self.relabel_relativiser(pred):
            # No overt relativiser found: build a null one from the sibling
            top, context = get_first(node, r'/[ICV]P/=TOP $ *=SS', with_context=True)
            ss = context.ss
            debug("Creating null relativiser unary category: %s", ss.category / ss.category)
            replace_kid(top.parent, top,
                        Node("NN", [top], ss.category / ss.category, head_index=0))
def fix_long_bei_gap(self, node, bei, pred, top, n=None, reduced=False):
    """Repair the gap in a long-bei (passive LB) construction.

    Locates the traced NP under *top*, shrinks the gap, recomputes
    categories up to *top*, and relabels the bei marker's category.
    NOTE(review): *bei* is accepted but not read here — presumably kept for
    signature parity with related fixers; confirm.
    """
    debug("Fixing long bei gap: %s", lrp_repr(node))

    if not reduced:
        self.remove_null_element(top)

    # With no binder node, match any trace index.
    index = get_trace_index_from_tag(n.tag) if n else r'\*'

    expr = r'*=PP < { *=P < { /NP-(?:TPC|OBJ)/=T < ^/%s/a $ *=S } }' % index
    trace_NP, ctx = get_first(top, expr, with_context=True)
    pp, p, t, s = ctx.pp, ctx.p, ctx.t, ctx.s
    # remove T from P, then replace P with S
    self.fix_object_gap(pp, p, t, s)
    self.fix_categories_starting_from(s, until=top)
    self.relabel_bei_category(top, pred)

    top.category = top[0].category.left
    debug("done %s", pprint(top))
def remove_null_element(self, node):
    """Remove the null element WHNP/WHPP and its trace, shrinking the tree.

    Replaces the parent of the matched WH phrase with its sibling, so the
    null element (and the -NONE- '*OP*' trace beneath it) disappears.
    """
    pp, ctx = get_first(node, r'*=PP < { *=P < { /WH[NP]P/=T $ *=S } }',
                        with_context=True)
    p, t, s = ctx.p, ctx.t, ctx.s
    # Splice S into PP in place of P, discarding the WH phrase T.
    replace_kid(pp, p, s)
def fix_subject_extraction(self, _, n, pred, w=None, reduced=False):
    """Repair a subject-extraction relative construction rooted at *n*.

    Either rewrites NP(WHNP CP) as NP(CP) (bare-N mode), or shrinks each
    traced NP-SBJ gap and, failing an overt relativiser, inserts a null
    relativiser whose category is derived from the verbal node's sibling.
    """
    global use_bare_N
    debug("%s", reduced)
    node = n
    debug("Fixing subject extraction: %s", lrp_repr(node))

    # We only want this if we are using the N -> NP unary rule
    # This 'fix' lets us rewrite NP(WHNP CP) as NP(CP) with categories NP(N)
    if use_bare_N and pred.tag.startswith('NP'):
        # Fix for the NP(VP de) case:
        # ---------------------------
        #    NP                NP
        #   /  \               |
        # WHNP  CP     -->     CP
        #      /  \           /  \
        #     IP  DEC        IP  DEC
        if not pred.is_leaf():
            pred.kids.pop(0)
            pred.head_index = 0
    else:
        if not reduced:
            self.remove_null_element(node)

        index = get_trace_index_from_tag(w.tag) if w else ''

        expr = r'*=PP < { *=P < { /NP-SBJ/=T << ^/\*T\*%s/ $ *=S } }' % index
        for trace_NP, ctx in find_all(node, expr, with_context=True):
            pp, p, t, s = ctx.pp, ctx.p, ctx.t, ctx.s
            self.fix_object_gap(pp, p, t, s)
            self.fix_categories_starting_from(s, until=node)

            if not self.relabel_relativiser(pred):
                # TOP is the shrunk VP
                # after shrinking, we can get VV or VA here
                # left_to_right so that we find the right node (used to match
                # against the CP 已建成的 in 4:45(7))
                result = get_first(
                    node,
                    r'{ /([ICV]P|V[VA]|VRD|VSB|VCD)/=TOP $ *=SS } ! > /([ICV]P|V[VA]|VRD|VSB|VCD)/',
                    with_context=True, left_to_right=True)
                if not result:
                    debug('Could not find verbal category; did not create null relativiser.')
                    return

                top, context = result
                SS = context.ss.category
                debug("Creating null relativiser unary category: %s", SS / SS)
                replace_kid(top.parent, top, Node("NN", [top], SS / SS, head_index=0))
def fix_object_extraction(self, _, n, pred, w=None, reduced=False):
    """Repair an object-extraction relative construction rooted at *n*.

    Either rewrites NP(WHNP CP) as NP(CP) (bare-N mode), or shrinks each
    traced NP-OBJ/NP-EXT gap and, failing an overt relativiser (the null
    relativiser case), inserts a null relativiser from TOP's sibling.
    """
    global use_bare_N
    node = n
    debug("Fixing object extraction: %s", lrp_repr(node))

    # We only want this if we are using the N -> NP unary rule
    # This 'fix' lets us rewrite NP(WHNP CP) as NP(CP) with categories NP(N)
    if use_bare_N and pred.tag.startswith('NP'):
        # Fix for the NP(VP de) case:
        # ---------------------------
        #    NP                NP
        #   /  \               |
        # WHNP  CP     -->     CP
        #      /  \           /  \
        #     IP  DEC        IP  DEC
        if not pred.is_leaf():
            pred.kids.pop(0)
            pred.head_index = 0
    else:
        if not reduced:
            self.remove_null_element(node)

        index = get_trace_index_from_tag(w.tag) if w else ''

        expr = (r'/[IC]P/=TOP << { *=PP < { *=P < { /NP-(OBJ|EXT)/=T << ^/\*T\*%s/ $ *=S } } }'
                % index)
        for trace_NP, ctx in find_all(node, expr, with_context=True):
            top, pp, p, t, s = ctx.top, ctx.pp, ctx.p, ctx.t, ctx.s
            self.fix_object_gap(pp, p, t, s)
            self.fix_categories_starting_from(s, until=top)

            # If we couldn't find the DEC node, this is the null relativiser case
            if not self.relabel_relativiser(pred):
                # TOP is the S node
                # null relativiser category comes from sibling of TOP
                # if TOP has no sibling, then we're likely inside a
                # NP-PRD < CP reduced relative (cf 1:2(9))
                result = get_first(top, r'* $ *=SS',
                                   with_context=True, nonrecursive=True)
                if result:
                    _, sctx = result
                    ss = sctx.ss
                    debug("Creating null relativiser unary category: %s",
                          ss.category / ss.category)
                    replace_kid(top.parent, top,
                                Node("NN", [top], ss.category / ss.category, head_index=0))
def relabel_bei_category(self, top, pred):
    """Relabel the category of the bei (passive) marker under *top*.

    Returns the bei node with its category rebuilt so that its right slot
    takes the sibling S's category and its result's right slot takes the
    predicate's category.
    """
    # particle 'you' is tagged as a preposition but acts as the BEI marker
    bei, ctx = get_first(
        top,
        r'*=S [ $ /LB/=BEI | $ ^"由"=BEI | $ ^"经"=BEI | $ ^"经过"=BEI | $ ^"随"=BEI | $ ^"为"=BEI | $ ^"以"=BEI | $ ^"经由"=BEI ]',
        with_context=True)
    s, bei = ctx.s, ctx.bei

    bei.category = bei.category.clone_with(right=s.category)
    # NOTE(review): mutates the private _right slot of the result category
    # directly — presumably clone_with cannot reach it; confirm.
    bei.category.left._right = pred.category
    bei.parent.category = bei.category.left

    debug("new bei category: %s", bei.category)
    return bei
def fix_subject_extraction(self, _, n, pred, w=None, reduced=False):
    """Repair a subject-extraction relative construction rooted at *n*.

    NOTE(review): duplicate of the earlier definition of the same name —
    presumably one copy should be removed; verify.
    """
    global use_bare_N
    debug("%s", reduced)
    node = n
    debug("Fixing subject extraction: %s", lrp_repr(node))

    # We only want this if we are using the N -> NP unary rule
    # This 'fix' lets us rewrite NP(WHNP CP) as NP(CP) with categories NP(N)
    if use_bare_N and pred.tag.startswith('NP'):
        # Fix for the NP(VP de) case:
        # ---------------------------
        #    NP                NP
        #   /  \               |
        # WHNP  CP     -->     CP
        #      /  \           /  \
        #     IP  DEC        IP  DEC
        if not pred.is_leaf():
            pred.kids.pop(0)
            pred.head_index = 0
    else:
        if not reduced:
            self.remove_null_element(node)

        index = get_trace_index_from_tag(w.tag) if w else ''

        expr = r'*=PP < { *=P < { /NP-SBJ/=T << ^/\*T\*%s/ $ *=S } }' % index
        for trace_NP, ctx in find_all(node, expr, with_context=True):
            pp, p, t, s = ctx.pp, ctx.p, ctx.t, ctx.s
            self.fix_object_gap(pp, p, t, s)
            self.fix_categories_starting_from(s, until=node)

            if not self.relabel_relativiser(pred):
                # TOP is the shrunk VP
                # after shrinking, we can get VV or VA here
                # left_to_right so that we find the right node (used to match
                # against the CP 已建成的 in 4:45(7))
                result = get_first(
                    node,
                    r'{ /([ICV]P|V[VA]|VRD|VSB|VCD)/=TOP $ *=SS } ! > /([ICV]P|V[VA]|VRD|VSB|VCD)/',
                    with_context=True, left_to_right=True)
                if not result:
                    debug('Could not find verbal category; did not create null relativiser.')
                    return

                top, context = result
                SS = context.ss.category
                debug("Creating null relativiser unary category: %s", SS / SS)
                replace_kid(top.parent, top, Node("NN", [top], SS / SS, head_index=0))
def relabel_bei_category(self, top, pred):
    """Relabel the category of the bei (passive) marker under *top*.

    NOTE(review): duplicate of the earlier definition of the same name —
    presumably one copy should be removed; verify.
    """
    # particle 'you' is tagged as a preposition but acts as the BEI marker
    bei, ctx = get_first(
        top,
        r'*=S [ $ /LB/=BEI | $ ^"由"=BEI | $ ^"经"=BEI | $ ^"经过"=BEI | $ ^"随"=BEI | $ ^"为"=BEI | $ ^"以"=BEI | $ ^"经由"=BEI ]',
        with_context=True)
    s, bei = ctx.s, ctx.bei

    bei.category = bei.category.clone_with(right=s.category)
    # Mutates the result category's right slot directly.
    bei.category.left._right = pred.category
    bei.parent.category = bei.category.left

    debug("new bei category: %s", bei.category)
    return bei
def relabel_relativiser(self, node):
    """Relabel the relativiser category (NP/NP)\\S to (NP/NP)\\(S|NP).

    Returns True when a DEC/SP relativiser was found and relabelled,
    False (with a warning) otherwise.
    """
    result = get_first(node, r'*=S $ /(DEC|SP)/=REL',
                       with_context=True, left_to_right=True)
    if result is None:
        warn("Couldn't find relativiser under %s", node)
        return False

    _, context = result
    s, relativiser = context.s, context.rel
    # The relativiser's argument becomes the category of its sibling S.
    relativiser.category = relativiser.category.clone_with(right=s.category)
    debug("New rel category: %s", relativiser.category)
    return True
def accept_derivation(self, bundle):
    """Tally relative-clause statistics for one derivation.

    Counts each derivation containing at least one SBAR < WHNP whose trace
    is resolvable (self.rcderivs), and tallies the trace's parent tag
    (index suffix stripped) in self.parents / self.total.
    """
    seen_rc_in_deriv = False
    root = bundle.derivation

    for node, ctx in tgrep(root, r'{ /SBAR/=SBAR < /WHNP/=WHNP } $ /NP/=NP',
                           with_context=True):
        trace_finder = r"^/\*T\*%s/" % extract_index(ctx.whnp)
        trace_node = get_first(ctx.sbar, trace_finder)
        if trace_node is None:
            continue

        # Count the derivation itself only once.
        if not seen_rc_in_deriv:
            self.rcderivs += 1
            seen_rc_in_deriv = True

        # Strip any trailing trace index (e.g. "-1") from the parent tag.
        parent_type = re.sub(r'-\d+$', '', trace_node.parent.tag)
        self.parents[parent_type] += 1
        self.total += 1
def fix_object_extraction(self, _, n, pred, w=None, reduced=False):
    """Repair an object-extraction relative construction rooted at *n*.

    NOTE(review): duplicate of the earlier definition of the same name —
    presumably one copy should be removed; verify.
    """
    global use_bare_N
    node = n
    debug("Fixing object extraction: %s", lrp_repr(node))

    # We only want this if we are using the N -> NP unary rule
    # This 'fix' lets us rewrite NP(WHNP CP) as NP(CP) with categories NP(N)
    if use_bare_N and pred.tag.startswith('NP'):
        # Fix for the NP(VP de) case:
        # ---------------------------
        #    NP                NP
        #   /  \               |
        # WHNP  CP     -->     CP
        #      /  \           /  \
        #     IP  DEC        IP  DEC
        if not pred.is_leaf():
            pred.kids.pop(0)
            pred.head_index = 0
    else:
        if not reduced:
            self.remove_null_element(node)

        index = get_trace_index_from_tag(w.tag) if w else ''

        expr = (r'/[IC]P/=TOP << { *=PP < { *=P < { /NP-(OBJ|EXT)/=T << ^/\*T\*%s/ $ *=S } } }'
                % index)
        for trace_NP, ctx in find_all(node, expr, with_context=True):
            top, pp, p, t, s = ctx.top, ctx.pp, ctx.p, ctx.t, ctx.s
            self.fix_object_gap(pp, p, t, s)
            self.fix_categories_starting_from(s, until=top)

            # If we couldn't find the DEC node, this is the null relativiser case
            if not self.relabel_relativiser(pred):
                # TOP is the S node
                # null relativiser category comes from sibling of TOP
                # if TOP has no sibling, then we're likely inside a
                # NP-PRD < CP reduced relative (cf 1:2(9))
                result = get_first(top, r'* $ *=SS',
                                   with_context=True, nonrecursive=True)
                if result:
                    _, sctx = result
                    ss = sctx.ss
                    debug("Creating null relativiser unary category: %s",
                          ss.category / ss.category)
                    replace_kid(top.parent, top,
                                Node("NN", [top], ss.category / ss.category, head_index=0))
def accept_derivation(self, bundle):
    """Tally relative-clause statistics for one derivation.

    NOTE(review): duplicate of the earlier definition of the same name —
    presumably one copy should be removed; verify.
    """
    seen_rc_in_deriv = False
    root = bundle.derivation

    for node, ctx in tgrep(root, r'{ /SBAR/=SBAR < /WHNP/=WHNP } $ /NP/=NP',
                           with_context=True):
        trace_finder = r"^/\*T\*%s/" % extract_index(ctx.whnp)
        trace_node = get_first(ctx.sbar, trace_finder)
        if trace_node is None:
            continue

        # Count the derivation itself only once.
        if not seen_rc_in_deriv:
            self.rcderivs += 1
            seen_rc_in_deriv = True

        # Strip any trailing trace index (e.g. "-1") from the parent tag.
        parent_type = re.sub(r'-\d+$', '', trace_node.parent.tag)
        self.parents[parent_type] += 1
        self.total += 1
def relabel_relativiser(self, node):
    """Relabel the relativiser category (NP/NP)\\S to (NP/NP)\\(S|NP).

    NOTE(review): duplicate of the earlier definition of the same name —
    presumably one copy should be removed; verify.
    """
    result = get_first(node, r'*=S $ /(DEC|SP)/=REL',
                       with_context=True, left_to_right=True)
    if result is None:
        warn("Couldn't find relativiser under %s", node)
        return False

    _, context = result
    s, relativiser = context.s, context.rel
    relativiser.category = relativiser.category.clone_with(right=s.category)
    debug("New rel category: %s", relativiser.category)
    return True
def accept_derivation(self, bundle):
    """Count, per named tgrep pattern, how many derivations contain a match.

    Increments self.freq['all'] for every derivation seen, and
    self.freq[name] for each pattern in name_to_pattern_map that matches
    anywhere in the derivation.
    """
    root = bundle.derivation
    self.freq['all'] += 1
    for pattern_name, pattern in name_to_pattern_map.items():
        if get_first(root, pattern):
            self.freq[pattern_name] += 1
def preprocess(root):
    """Normalise a PCTB-style derivation tree in place and return it.

    Applies a long sequence of Chinese-treebank-specific structural repairs
    (paired-quote reshaping, missing phrasal layers, mistagged unary
    projections, long-bei and BA reshaping, etc.). Each branch cites the
    derivation(s) that motivated it.
    """
    # IP < PP PU -> PP < PP PU (20:58(1))
    if root.count() == 2 and root[1].tag == 'PU' and root[0].tag.startswith('PP'):
        root.tag = root[0].tag

    for node in nodes(root):
        if node.is_leaf():
            continue

        if rewrite_lcp_as_np and node.tag.startswith('LCP'):
            node.tag = node.tag.replace('LCP', 'NP')

        first_kid, first_kid_index = get_nonpunct_kid(node, get_last=False)
        last_kid, last_kid_index = get_nonpunct_kid(node, get_last=True)

        # ---------------------
        # Where LPU, RPU are paired punctuation, reshape
        # YP(LPU ... XP RPU YP) into YP(XP(LPU ... XP) YP)
        if (any(kid.lex in ("“", "「") for kid in leaf_kids(node))
                and any(kid.lex in ("”", "」") for kid in leaf_kids(node))):
            lqu = first_index_such_that(
                lambda kid: kid.is_leaf() and kid.lex in ("“", "「"), node)
            rqu = first_index_such_that(
                lambda kid: kid.is_leaf() and kid.lex in ("”", "」"), node)
            if rqu != node.count() - 1:
                quoted_kids = node.kids[lqu:rqu + 1]
                del node.kids[lqu:rqu + 1]
                last_nonpunct_kid, _ = get_nonpunct_element(quoted_kids, get_last=True)
                # Bad punctuation in 27:84(4) causes a mis-analysis, just ignore
                if last_nonpunct_kid:
                    quoted_node = Node(last_nonpunct_kid.tag, quoted_kids)
                    node.kids.insert(lqu, quoted_node)

        # CPTB/Chinese-specific fixes
        # ---------------------------
        # PP(P CP NP) in derivations like 5:11(3) should be PP(P NP(CP NP))
        if first_kid and first_kid.tag == "P" and node.count() > 2:
            last_tag = last_kid.tag
            rest = node.kids[1:]
            del node.kids[1:]
            node.kids.append(Node(last_tag, rest, node))

        # 2:12(3). DNP-PRD fixed by adding a layer of NP
        elif (node.tag.startswith('VP') and node.count() == 2
              and node[0].tag.startswith('VC')
              and node[1].tag.startswith('DNP-PRD')):
            node[1] = Node('NP', [node[1]], node)

        # fix missing -OBJ tag from VP object complements (c.f. 31:18(4))
        elif (node.tag.startswith('VP') and node.count() >= 2
              and node.tag.startswith('VP')
              and node[0].tag == 'VV' and node[-1].tag == 'NP'):
            node[-1].tag += "-OBJ"

        # fix bad annotation IP < IP (2:7(28)), VP < VP (0:1(5))
        elif any(is_repeated_unary_projection(xp, node)
                 for xp in ('IP', 'VP', 'NP', 'CP')):
            node.kids = node[0].kids

        # treat DP-SBJ as QP-SBJ (6:37(9)): the rationale is that the
        # determiner (e.g. 每) acts as a specifier, just like a quantity
        elif node.tag == 'DP-SBJ':
            node.tag = 'QP-SBJ'

        # attach the PU preceding a PRN under the PRN
        elif last_kid and last_kid.tag == 'PRN' and last_kid.count() == 1:
            maybe_pu = node[last_kid_index - 1]
            if maybe_pu.tag == 'PU':
                del node.kids[last_kid_index - 1]
                last_kid.kids.insert(0, maybe_pu)  # prepend

        # DEG instead of DEC (29:34(3)). if there's a trace in DEG's sibling
        # and no DEC, then change DEG to DEC.
        elif (node.tag == 'CP' and node.count() == 2
              and node[0].tag == 'IP' and node[1].tag == 'DEG'):
            if get_first(node[0], r'^/\*T\*/') and not get_first(node[0], r'/DEC/'):
                node[1].tag = 'DEC'

        elif node.tag.startswith('NP') and any(
                kid.tag.startswith('QP-APP') for kid in node):
            for kid in node:
                if kid.tag.startswith('QP-APP'):
                    kid.tag = kid.tag.replace('QP', 'NP')

        # NP(CP NP-APP NP-PN) -> NP(CP NP(NP-APP NP-PN)) so that
        # NP(NP-APP NP-PN) can receive NP internal structure-type analysis
        elif (node.tag.startswith('NP') and node.count() == 3
              and node[0].tag.startswith('CP')
              and node[1].tag.startswith('NP-APP')
              and node[2].tag.startswith('NP-PN')):
            np_app, np_pn = node[1], node[2]
            del node.kids[1:]
            node.kids.append(Node(node.tag, [np_app, np_pn], node))

        # IP < NP-SBJ ADVP VP rather than IP < NP-SBJ VP(ADVP VP)
        # (25:59(12), 6:92(19))
        elif (node.tag == 'IP' and node.count() == 3
              and node[0].tag == 'NP-SBJ' and node[1].tag == 'ADVP'
              and node[2].tag == 'VP'):
            advp = node.kids.pop(1)  # VP is the new node[1]
            # now replace node[1] with Node(node[1])
            node[1] = Node(node[1].tag, [advp, node[1]], node)

        # fixing DNP(PN DEG), which causes mis-tagging DNP(PN:l DEG:h)
        # only 3 cases: 23:61(5), 9:14(14), 21:3(11)
        elif (node.tag == 'DNP' and node.count() == 2
              and node[0].tag == 'PN' and node[1].tag == 'DEG'):
            replace_kid(node, node[0], Node('NP', [node[0]]))

        elif is_vnv(node) and node.count() == 3:
            # Re-analyse VNV as coordination
            node[1].tag = 'CC'

        # fix mistaggings of the form ADVP < JJ (1:7(9)), NP < JJ (5:35(1))
        elif node.count() == 1:
            # fix IP < VP by adding *pro*
            if node.tag.startswith('IP') and node[0].tag.startswith('VP'):
                leaf = Leaf('-NONE-', '*pro*', None)
                pro = Node('NP-SBJ', [leaf])
                node.kids.insert(0, pro)
            elif node[0].tag == 'JJ':
                if node.tag.startswith('ADVP'):
                    node.tag = node.tag.replace('ADVP', 'ADJP')
                elif node.tag.startswith('NP'):
                    node.tag = node.tag.replace('NP', 'ADJP')
            # fix NP < VV
            elif node.tag == 'NP' and node[0].tag == 'VV':
                node.tag = node.tag.replace('NP', 'VP')
            # fix NP < ADJP < JJ (5:35(1))
            elif node.tag == 'NP' and node[0].tag == 'ADJP':
                replace_kid(node.parent, node, node[0])
            # fix projections NP < QP
            elif node[0].tag.startswith('QP') and node.tag.startswith('NP'):
                inherit_tag(node[0], node)  # copy PCTB tags from NP to QP
                node.tag = node[0].tag  # copy QP to parent, replacing NP
                node.kids = node[0].kids
            elif node[0].tag == 'IP' and node.tag == 'CP-APP':
                inherit_tag(node[0], node)
                node.tag = node[0].tag
                node.kids = node[0].kids
            # CLP < NN
            elif node[0].tag == 'NN' and node.tag == 'CLP':
                node[0].tag = 'M'
            elif node[0].tag == 'NN' and node.tag.startswith("VP"):
                node[0].tag = 'VV'
            elif node[0].tag == 'CP':
                if node.tag == 'NP-PRD':
                    node.kids = node[0].kids
                else:
                    # Rewrite NP < { CP < { CP < DEC } }
                    # (i.e. 比 报告 的 早 一点) so that it's headed by the 的
                    expr = r'''/CP/ < { /CP/ < /DEC/ }'''
                    if get_first(node[0], expr):
                        node.kids = node[0].kids
            elif node[0].tag in ('NP', 'NP-PN', 'VP', 'IP') and node.tag == 'PRN':
                node.kids = node[0].kids
            # ADVP < CS: shrink so that CS will be considered the head by binarise
            # CP < M: tagging error 7:14(8), 10:51(4), 11:13(32), 11:15(47)
            elif ((node.tag == 'ADVP' and node[0].tag == 'CS')
                  or (node[0].tag == 'M' and node.tag == 'CP')):
                replace_kid(node.parent, node, node[0])
            # fix NP<DNP so that it's headed by the DEC 8:38(18), 0:30(4)
            elif node.tag.startswith('NP') and node[0].tag.startswith('DNP'):
                node.kids = node[0].kids
            # elif node.tag == 'VP' and node[0].tag == 'NP-PRD':
            #     replace_kid(node.parent, node, node[0])
            # couple of noisy derivs like 10:35(80), 10:26(121), 11:37(3)
            # elif node.tag == 'VP' and node[0].tag.startswith('IP'):
            #     replace_kid(node.parent, node, node[0])

        # Reshape LB (long bei)
        # ---------------------
        elif first_kid and first_kid.tag == "LB":
            expr = r'''* < { /LB/=LB [ $ { * < /-(SBJ|OBJ|PN)/a=SBJ < /(V[PV]|VRD|VSB)/=PRED }=IP | $ { /CP/=CP < { *=IP < /-(SBJ|OBJ|PN)/a=SBJ < /(V[PV]|VRD|VSB)/=PRED } } ] }'''
            top, ctx = get_first(node, expr, with_context=True)
            lb, sbj, pred, cp, ip = ctx.lb, ctx.sbj, ctx.pred, ctx.cp, ctx.ip
            top.kids = [lb, Node('IP', [sbj, pred])]
            # top.kids = [lb, sbj, pred]

        elif first_kid and first_kid.tag == "BA":
            expr = r'''* < { /BA/=LB $ { /IP/ < /NP/=SBJ < /VP/=PRED } }'''
            result = get_first(node, expr, with_context=True)
            if result:
                top, ctx = result
                lb, sbj, pred, ip = ctx.lb, ctx.sbj, ctx.pred, ctx.ip
                # top.kids = [lb, Node('IP', [sbj, pred])]
                top.kids = [lb, sbj, pred]

        # single mistagging CP-SBJ for CP in 24:58(1)
        elif node.tag == 'CP-SBJ':
            node.tag = 'CP'

        else:
            # Fix missing phrasal layer in NP < NN DEG (21:10(4))
            result = get_first(node, r'/DNP/=P < { /N[NRT]/=N $ /DEG/ }',
                               with_context=True)
            if result:
                p, ctx = result
                n = ctx.n
                replace_kid(p, n, Node('NP', [n]))

            # Fix missing phrasal layer in LCP < NN LC (11:17(9))
            result = get_first(node, r'/LCP/=P < { /N[NRT]/=N $ /LC/ }',
                               with_context=True)
            if result:
                p, ctx = result
                n = ctx.n
                replace_kid(p, n, Node('NP', [n]))

            # Fix wrongly attached DEC (5:26(6))
            result = get_first(node,
                               r'/CP/=TOP < { /IP/=P < { /NP/ $ /VP/ $ /DEC/=DEC } }',
                               with_context=True)
            if result:
                _, ctx = result
                top, p, dec = ctx.top, ctx.p, ctx.dec
                top.kids.append(dec)
                p.kids.remove(dec)

            result = get_first(node,
                               r'*=PP < { /IP-TPC/=P <1 { /NP/=T < ^/\*PRO\*/ } <2 /VP/=S }',
                               nonrecursive=True, with_context=True)
            if result:
                _, ctx = result
                pp, p, s = ctx.pp, ctx.p, ctx.s
                inherit_tag(s, p)
                replace_kid(pp, p, s)

            expr = r'''/VP/=VP <1 /VV/=V <2 { /IP-OBJ/ <1 /NP-SBJ/=SBJ <2 /VP/=PRED }'''
            result = get_first(node, expr, with_context=True)
            if result:
                _, ctx = result
                vp, v, sbj, pred = ctx.vp, ctx.v, ctx.sbj, ctx.pred
                del vp.kids
                if get_first(sbj, r'* < ^/\*PRO\*/'):
                    vp.kids = [v, pred]
                else:
                    vp.kids = [v, sbj, pred]

            expr = r'''/QP/=P <1 /CD/ <2 /CC/ <3 /CD/'''
            result = get_first(node, expr, with_context=True)
            if result:
                _, ctx = result
                p = ctx.p
                if p.count() <= 3:
                    continue
                cd_cc_cd, rest = p.kids[0:3], p.kids[3:]
                del p.kids[0:3]
                new_node = Node('QP', cd_cc_cd)
                p.kids.insert(0, new_node)

    return root
def preprocess(root):
    """Normalise a PCTB-style derivation tree in place and return it.

    NOTE(review): duplicate of the earlier definition of the same name —
    presumably one copy should be removed; verify before deleting.
    """
    # IP < PP PU -> PP < PP PU (20:58(1))
    if root.count() == 2 and root[1].tag == 'PU' and root[0].tag.startswith('PP'):
        root.tag = root[0].tag

    for node in nodes(root):
        if node.is_leaf():
            continue

        if rewrite_lcp_as_np and node.tag.startswith('LCP'):
            node.tag = node.tag.replace('LCP', 'NP')

        first_kid, first_kid_index = get_nonpunct_kid(node, get_last=False)
        last_kid, last_kid_index = get_nonpunct_kid(node, get_last=True)

        # ---------------------
        # Where LPU, RPU are paired punctuation, reshape
        # YP(LPU ... XP RPU YP) into YP(XP(LPU ... XP) YP)
        if (any(kid.lex in ("“", "「") for kid in leaf_kids(node))
                and any(kid.lex in ("”", "」") for kid in leaf_kids(node))):
            lqu = first_index_such_that(
                lambda kid: kid.is_leaf() and kid.lex in ("“", "「"), node)
            rqu = first_index_such_that(
                lambda kid: kid.is_leaf() and kid.lex in ("”", "」"), node)
            if rqu != node.count() - 1:
                quoted_kids = node.kids[lqu:rqu + 1]
                del node.kids[lqu:rqu + 1]
                last_nonpunct_kid, _ = get_nonpunct_element(quoted_kids, get_last=True)
                # Bad punctuation in 27:84(4) causes a mis-analysis, just ignore
                if last_nonpunct_kid:
                    quoted_node = Node(last_nonpunct_kid.tag, quoted_kids)
                    node.kids.insert(lqu, quoted_node)

        # CPTB/Chinese-specific fixes
        # ---------------------------
        # PP(P CP NP) in derivations like 5:11(3) should be PP(P NP(CP NP))
        if first_kid and first_kid.tag == "P" and node.count() > 2:
            last_tag = last_kid.tag
            rest = node.kids[1:]
            del node.kids[1:]
            node.kids.append(Node(last_tag, rest, node))

        # 2:12(3). DNP-PRD fixed by adding a layer of NP
        elif (node.tag.startswith('VP') and node.count() == 2
              and node[0].tag.startswith('VC')
              and node[1].tag.startswith('DNP-PRD')):
            node[1] = Node('NP', [node[1]], node)

        # fix missing -OBJ tag from VP object complements (c.f. 31:18(4))
        elif (node.tag.startswith('VP') and node.count() >= 2
              and node.tag.startswith('VP')
              and node[0].tag == 'VV' and node[-1].tag == 'NP'):
            node[-1].tag += "-OBJ"

        # fix bad annotation IP < IP (2:7(28)), VP < VP (0:1(5))
        elif any(is_repeated_unary_projection(xp, node)
                 for xp in ('IP', 'VP', 'NP', 'CP')):
            node.kids = node[0].kids

        # treat DP-SBJ as QP-SBJ (6:37(9)): the rationale is that the
        # determiner (e.g. 每) acts as a specifier, just like a quantity
        elif node.tag == 'DP-SBJ':
            node.tag = 'QP-SBJ'

        # attach the PU preceding a PRN under the PRN
        elif last_kid and last_kid.tag == 'PRN' and last_kid.count() == 1:
            maybe_pu = node[last_kid_index - 1]
            if maybe_pu.tag == 'PU':
                del node.kids[last_kid_index - 1]
                last_kid.kids.insert(0, maybe_pu)  # prepend

        # DEG instead of DEC (29:34(3)). if there's a trace in DEG's sibling
        # and no DEC, then change DEG to DEC.
        elif (node.tag == 'CP' and node.count() == 2
              and node[0].tag == 'IP' and node[1].tag == 'DEG'):
            if get_first(node[0], r'^/\*T\*/') and not get_first(node[0], r'/DEC/'):
                node[1].tag = 'DEC'

        elif node.tag.startswith('NP') and any(
                kid.tag.startswith('QP-APP') for kid in node):
            for kid in node:
                if kid.tag.startswith('QP-APP'):
                    kid.tag = kid.tag.replace('QP', 'NP')

        # NP(CP NP-APP NP-PN) -> NP(CP NP(NP-APP NP-PN)) so that
        # NP(NP-APP NP-PN) can receive NP internal structure-type analysis
        elif (node.tag.startswith('NP') and node.count() == 3
              and node[0].tag.startswith('CP')
              and node[1].tag.startswith('NP-APP')
              and node[2].tag.startswith('NP-PN')):
            np_app, np_pn = node[1], node[2]
            del node.kids[1:]
            node.kids.append(Node(node.tag, [np_app, np_pn], node))

        # IP < NP-SBJ ADVP VP rather than IP < NP-SBJ VP(ADVP VP)
        # (25:59(12), 6:92(19))
        elif (node.tag == 'IP' and node.count() == 3
              and node[0].tag == 'NP-SBJ' and node[1].tag == 'ADVP'
              and node[2].tag == 'VP'):
            advp = node.kids.pop(1)  # VP is the new node[1]
            # now replace node[1] with Node(node[1])
            node[1] = Node(node[1].tag, [advp, node[1]], node)

        # fixing DNP(PN DEG), which causes mis-tagging DNP(PN:l DEG:h)
        # only 3 cases: 23:61(5), 9:14(14), 21:3(11)
        elif (node.tag == 'DNP' and node.count() == 2
              and node[0].tag == 'PN' and node[1].tag == 'DEG'):
            replace_kid(node, node[0], Node('NP', [node[0]]))

        elif is_vnv(node) and node.count() == 3:
            # Re-analyse VNV as coordination
            node[1].tag = 'CC'

        # fix mistaggings of the form ADVP < JJ (1:7(9)), NP < JJ (5:35(1))
        elif node.count() == 1:
            # fix IP < VP by adding *pro*
            if node.tag.startswith('IP') and node[0].tag.startswith('VP'):
                leaf = Leaf('-NONE-', '*pro*', None)
                pro = Node('NP-SBJ', [leaf])
                node.kids.insert(0, pro)
            elif node[0].tag == 'JJ':
                if node.tag.startswith('ADVP'):
                    node.tag = node.tag.replace('ADVP', 'ADJP')
                elif node.tag.startswith('NP'):
                    node.tag = node.tag.replace('NP', 'ADJP')
            # fix NP < VV
            elif node.tag == 'NP' and node[0].tag == 'VV':
                node.tag = node.tag.replace('NP', 'VP')
            # fix NP < ADJP < JJ (5:35(1))
            elif node.tag == 'NP' and node[0].tag == 'ADJP':
                replace_kid(node.parent, node, node[0])
            # fix projections NP < QP
            elif node[0].tag.startswith('QP') and node.tag.startswith('NP'):
                inherit_tag(node[0], node)  # copy PCTB tags from NP to QP
                node.tag = node[0].tag  # copy QP to parent, replacing NP
                node.kids = node[0].kids
            elif node[0].tag == 'IP' and node.tag == 'CP-APP':
                inherit_tag(node[0], node)
                node.tag = node[0].tag
                node.kids = node[0].kids
            # CLP < NN
            elif node[0].tag == 'NN' and node.tag == 'CLP':
                node[0].tag = 'M'
            elif node[0].tag == 'NN' and node.tag.startswith("VP"):
                node[0].tag = 'VV'
            elif node[0].tag == 'CP':
                if node.tag == 'NP-PRD':
                    node.kids = node[0].kids
                else:
                    # Rewrite NP < { CP < { CP < DEC } }
                    # (i.e. 比 报告 的 早 一点) so that it's headed by the 的
                    expr = r'''/CP/ < { /CP/ < /DEC/ }'''
                    if get_first(node[0], expr):
                        node.kids = node[0].kids
            elif node[0].tag in ('NP', 'NP-PN', 'VP', 'IP') and node.tag == 'PRN':
                node.kids = node[0].kids
            # ADVP < CS: shrink so that CS will be considered the head by binarise
            # CP < M: tagging error 7:14(8), 10:51(4), 11:13(32), 11:15(47)
            elif ((node.tag == 'ADVP' and node[0].tag == 'CS')
                  or (node[0].tag == 'M' and node.tag == 'CP')):
                replace_kid(node.parent, node, node[0])
            # fix NP<DNP so that it's headed by the DEC 8:38(18), 0:30(4)
            elif node.tag.startswith('NP') and node[0].tag.startswith('DNP'):
                node.kids = node[0].kids
            # elif node.tag == 'VP' and node[0].tag == 'NP-PRD':
            #     replace_kid(node.parent, node, node[0])
            # couple of noisy derivs like 10:35(80), 10:26(121), 11:37(3)
            # elif node.tag == 'VP' and node[0].tag.startswith('IP'):
            #     replace_kid(node.parent, node, node[0])

        # Reshape LB (long bei)
        # ---------------------
        elif first_kid and first_kid.tag == "LB":
            expr = r'''* < { /LB/=LB [ $ { * < /-(SBJ|OBJ|PN)/a=SBJ < /(V[PV]|VRD|VSB)/=PRED }=IP | $ { /CP/=CP < { *=IP < /-(SBJ|OBJ|PN)/a=SBJ < /(V[PV]|VRD|VSB)/=PRED } } ] }'''
            top, ctx = get_first(node, expr, with_context=True)
            lb, sbj, pred, cp, ip = ctx.lb, ctx.sbj, ctx.pred, ctx.cp, ctx.ip
            top.kids = [lb, Node('IP', [sbj, pred])]
            # top.kids = [lb, sbj, pred]

        elif first_kid and first_kid.tag == "BA":
            expr = r'''* < { /BA/=LB $ { /IP/ < /NP/=SBJ < /VP/=PRED } }'''
            result = get_first(node, expr, with_context=True)
            if result:
                top, ctx = result
                lb, sbj, pred, ip = ctx.lb, ctx.sbj, ctx.pred, ctx.ip
                # top.kids = [lb, Node('IP', [sbj, pred])]
                top.kids = [lb, sbj, pred]

        # single mistagging CP-SBJ for CP in 24:58(1)
        elif node.tag == 'CP-SBJ':
            node.tag = 'CP'

        else:
            # Fix missing phrasal layer in NP < NN DEG (21:10(4))
            result = get_first(node, r'/DNP/=P < { /N[NRT]/=N $ /DEG/ }',
                               with_context=True)
            if result:
                p, ctx = result
                n = ctx.n
                replace_kid(p, n, Node('NP', [n]))

            # Fix missing phrasal layer in LCP < NN LC (11:17(9))
            result = get_first(node, r'/LCP/=P < { /N[NRT]/=N $ /LC/ }',
                               with_context=True)
            if result:
                p, ctx = result
                n = ctx.n
                replace_kid(p, n, Node('NP', [n]))

            # Fix wrongly attached DEC (5:26(6))
            result = get_first(node,
                               r'/CP/=TOP < { /IP/=P < { /NP/ $ /VP/ $ /DEC/=DEC } }',
                               with_context=True)
            if result:
                _, ctx = result
                top, p, dec = ctx.top, ctx.p, ctx.dec
                top.kids.append(dec)
                p.kids.remove(dec)

            result = get_first(node,
                               r'*=PP < { /IP-TPC/=P <1 { /NP/=T < ^/\*PRO\*/ } <2 /VP/=S }',
                               nonrecursive=True, with_context=True)
            if result:
                _, ctx = result
                pp, p, s = ctx.pp, ctx.p, ctx.s
                inherit_tag(s, p)
                replace_kid(pp, p, s)

            expr = r'''/VP/=VP <1 /VV/=V <2 { /IP-OBJ/ <1 /NP-SBJ/=SBJ <2 /VP/=PRED }'''
            result = get_first(node, expr, with_context=True)
            if result:
                _, ctx = result
                vp, v, sbj, pred = ctx.vp, ctx.v, ctx.sbj, ctx.pred
                del vp.kids
                if get_first(sbj, r'* < ^/\*PRO\*/'):
                    vp.kids = [v, pred]
                else:
                    vp.kids = [v, sbj, pred]

            expr = r'''/QP/=P <1 /CD/ <2 /CC/ <3 /CD/'''
            result = get_first(node, expr, with_context=True)
            if result:
                _, ctx = result
                p = ctx.p
                if p.count() <= 3:
                    continue
                cd_cc_cd, rest = p.kids[0:3], p.kids[3:]
                del p.kids[0:3]
                new_node = Node('QP', cd_cc_cd)
                p.kids.insert(0, new_node)

    return root