def _tag_head_initial(node, head):
    """Tag ``head`` with 'h', then tag the kids in ``node[1:]`` against it.

    A kid whose tag satisfies ``is_postverbal_adjunct_tag`` or starts with
    'ADVP' is tagged 'a'; any other kid whose tag does not start with 'PU'
    is tagged 'r'.
    """
    tag(head, 'h')
    for kid in node[1:]:
        if is_postverbal_adjunct_tag(kid.tag) or kid.tag.startswith('ADVP'):
            tag(kid, 'a')  # treat aspect particles as adjuncts
        elif not kid.tag.startswith('PU'):
            tag(kid, 'r')


def label(root):
    """Preprocess ``root``, tag the kids of each internal node, postprocess.

    For every non-leaf node yielded by ``nodes(root)`` the first matching
    configuration in the ``if``/``elif`` chain below decides which marker each
    kid receives through ``tag(kid, marker)``. The branch order is significant
    (see the inline comments). The marker letters presumably stand for head
    ('h'), adjunct ('a'), left/right argument ('l'/'r') and so on -- confirm
    against ``tag`` and its consumers.

    Args:
        root: root of the tree to label; passed through ``preprocess`` first.

    Returns:
        Whatever ``postprocess`` returns for the labelled tree.
    """
    root = preprocess(root)

    for node in nodes(root):
        if node.is_leaf():
            continue

        at_top = node.parent is None

        first_kid, _ = get_nonpunct_kid(node, get_last=False)
        last_kid, _ = get_nonpunct_kid(node, get_last=True)
        if first_kid is None:
            continue

        # Markers that do not depend on the configuration of the node.
        for kid in node:
            if has_modification_tag(kid):
                tag(kid, 'm')
            elif kid.tag == 'MSP':
                tag(kid, 'a')
            elif kid.tag == 'FLR':
                tag(kid, 'a')
            elif kid.tag == 'ETC':
                tag(kid, '&')
            else:
                tag_if_topicalisation(kid)

        if is_prn(node):
            # PRN tagging error in 10:49(69)
            if not first_kid:
                continue
            node.tag = first_kid.tag
            tag(node, 'p')
            tag(node[0], 'h')  # assume that the first PU introduces the PRN

        elif node.tag == 'FRAG':
            tag_adjunction(node, last_kid)

        # occasionally something that looks like CCG right absorption occurs
        # in the original annotation (0:23(8))
        elif is_right_absorption(node):
            pass

        elif is_predication(node):
            sbj_assigned = False
            vp_assigned = False
            for kid in reversed(node):
                if not sbj_assigned and (
                        '-SBJ' in kid.tag or
                        # TODO: we can get IP < NP-PN VP (0:40(5)). is this correct?
                        # exclude NP-PN-LOC (10:62(25))
                        ('-PN' in kid.tag and '-PN-LOC' not in kid.tag) or
                        # NP-APP VP in 11:31(88)
                        '-APP' in kid.tag or
                        kid.tag == "NP"):
                    tag(kid, 'l')  # TODO: is subject always left of predicate?
                    sbj_assigned = True
                elif not vp_assigned and kid.tag == 'VP':
                    tag(kid, 'h')
                    vp_assigned = True
                elif kid.tag not in ('PU', 'CC'):
                    tag(kid, 'a')

            if punct_cued_typechange:
                for i, kid in enumerate(node):
                    try:
                        # exclude IP-SBJ PU VP from having the PU tagged :h (1:53(9))
                        if ((kid.tag.startswith('IP-') or kid.tag.startswith('CP-'))
                                and kid.tag.find('-TPC') == -1
                                and kid.tag.find('-SBJ') == -1
                                and node[i + 1].tag == 'PU'):
                            tag(node[i + 1], 'h')
                    except Exception:
                        # Best effort (e.g. no node[i + 1] after the last kid),
                        # but let KeyboardInterrupt/SystemExit propagate.
                        continue

        elif (node.count() == 1 and node.tag.startswith('VP')
              and is_verb_compound(node[0])):
            pass

        elif is_vpt(node):  # fen de kai, da bu ying. vpt is head-final
            left = True
            for kid in node:
                if kid.tag.startswith("AD") or kid.tag.startswith("DER"):
                    tag(kid, 'h')
                    left = False
                elif left:
                    tag(kid, 'l')
                else:
                    tag(kid, 'r')

        elif is_vsb(node):
            # VSB is modifier+head, and hence is head-final
            # NOTE(review): despite the comment above, the code tags the
            # first kid as head, exactly like the VRD branch -- confirm.
            _tag_head_initial(node, first_kid)

        elif is_vcd(node):
            pass

        elif is_vcp(node):
            tag(first_kid, 'h')
            for kid in node[1:]:
                if kid.tag == "VC":
                    tag(kid, 'a')

        # NOTE(review): the is_vsb() test here looks unreachable because VSB
        # nodes are already claimed by the earlier branch; kept unchanged.
        elif is_vrd(node) or is_vsb(node):  # vrd is head-initial
            _tag_head_initial(node, first_kid)

        elif is_coordination(node, at_top=at_top):  # coordination
            for kid in node:
                if kid.tag == "ETC":
                    tag(kid, '&')
                if kid.tag not in ('CC', 'PU', 'CSC'):
                    tag(kid, 'c')

        elif is_np_internal_structure(node):
            first = True
            for kid in reversed(node.kids):
                if kid.tag == 'ETC':
                    tag(kid, '&')
                elif kid.tag not in ('CC', 'PU'):
                    if first:
                        tag(kid, 'N')
                        first = False
                    else:
                        tag(kid, 'n')

        # must be above is_coordination (it subsumes UCP)
        # NOTE(review): this branch currently sits *below* the
        # is_coordination branch, contrary to the comment -- confirm.
        elif is_ucp(node):
            left_conjunct_tag = first_kid.tag
            # NOTE:
            # There are some cases where the UCP annotation is suspect (1:36(11))
            # we will obtain the wrong analysis in these cases because the UCP
            # node does not directly dominate its conjuncts
            old_tag = node.tag
            node.tag = left_conjunct_tag
            node.tag = inherit_tag_str(node.tag, old_tag)

            for kid in nodes(node):
                if kid.tag is None:
                    # Diagnostic output only; the startswith() call below
                    # still fails for an untagged node, as before.
                    print(kid)
                    print(kid.tag)
                if kid.tag.startswith('UCP'):
                    kid.tag = left_conjunct_tag

            for kid in node:
                if kid.tag == 'ETC':
                    tag(kid, '&')
                elif kid.tag not in ('CC', 'PU'):
                    tag(kid, 'C')

        # exclude VP < VV AS: we want to tag this
        elif (node.count() == 1) or is_verb_compound(node):
            pass

        elif (first_kid.is_leaf()  # head initial complementation
              # quoted verb (see fix in _preprocess_ function)
              or satisfies_all(
                  lambda fkid: any(kid.is_leaf() and kid.tag == 'PU' for kid in fkid),
                  lambda fkid: any(kid.is_leaf() and kid.tag == 'VV' for kid in fkid))(first_kid)
              or is_verb_compound(first_kid)
              # HACK: to fix weird case of unary PP < P causing adjunction
              # analysis instead of head-initial (because of PP IP
              # configuration) in 10:76(4)
              or first_kid.tag == 'PP' and first_kid.count() == 1
              and first_kid[0].tag == "P"):
            _tag_head_initial(node, first_kid)

        # head final complementation
        # This has to come after head initial complementation, otherwise we
        # get the tagging VP < VV:l AS:h
        elif ((last_kid.is_leaf() or
               is_verb_compound(last_kid) or
               # lcp internal structure (cf 10:2(13)) is possible: despite the
               # structure (LCP (NP) (LCP)) this should be treated as
               # head-final complementation, not adjunction.
               is_lcp_internal_structure(last_kid))
              and not any(ftag in first_kid.tag for ftag in FunctionTags)):
            if last_kid.tag.startswith('SP'):
                # Treat the final question particle 吗 (ma) as the head to get
                # the type-change category S[q]\S[dcl] (25:21(5))
                if has_question_tag(node):
                    tag(last_kid, 'h')
                else:
                    tag(last_kid, 'a')
            else:
                tag(last_kid, 'h')

            # cf 2:23(7),1:9(28), a number of derivations have
            # (CP(WHNP-1 CP(IP) DEC) XP) instead of the expected
            # (CP (WHNP-1) CP(IP DEC) XP)
            # This lets us treat what would otherwise be considered head-final
            # as an adjunction
            # (the checks on last_kid.tag below run after last_kid was tagged)
            if last_kid.tag.startswith('DEC'):
                for kid in node[0:-1]:
                    if kid.tag.startswith('WHNP') or kid.tag.startswith('WHPP'):
                        tag(kid, 'a')
                    # ADVP as sibling of IP in 11:39(63)
                    elif not (kid.tag.startswith('PU') or kid.tag.startswith('ADVP')):
                        tag(kid, 'l')
            else:
                for kid in node[0:-1]:
                    if (last_kid.tag in VerbalCategories and (
                            is_postverbal_adjunct_tag(kid.tag) or
                            # exception added to account for direct
                            # modification of V{V,A} with ADVP (0:47(9))
                            kid.tag.startswith('ADVP'))):
                        tag(kid, 'a')  # treat aspect particles as adjuncts
                    elif not kid.tag.startswith('PU'):
                        tag(kid, 'l')

        # TODO: if this is below coordination, then NP(NP-APP PU NP) is
        # considered coordination instead of apposition (10:70(15))
        # actually, what happens if we get english-style apposition (NP1 , NP2)?
        elif is_apposition(node):
            # HACK: assume apposition is right-headed. (A disabled alternative
            # tagged last_kid 'h' when any kid was IP-APP.)
            tag(last_kid, 'r')

            for kid in node:
                if not kid.tag.startswith('PU'):
                    # exclude CP-APP (see is_apposition() above)
                    if kid.tag.endswith('-APP') and not kid.tag.startswith('CP'):
                        tag(kid, 'A')
                    else:
                        tag(kid, 'a')

        elif is_modification(node):
            tag(last_kid, 'h')
            for kid in node[0:-1]:
                if has_modification_tag(kid):
                    tag(kid, 'm')
                elif not kid.tag.startswith('PU'):
                    tag(kid, 'a')
                else:
                    tag_if_topicalisation(kid)

        elif is_argument_cluster(node):
            for kid in node:
                tag(kid, '@')

        else:  # adjunction
            tag_adjunction(node, last_kid)

    root = postprocess(root)
    return root
def _tag_head_initial(node, head):
    """Tag ``head`` with 'h', then tag the kids in ``node[1:]`` against it.

    A kid whose tag satisfies ``is_postverbal_adjunct_tag`` or starts with
    'ADVP' is tagged 'a'; any other kid whose tag does not start with 'PU'
    is tagged 'r'.
    """
    tag(head, 'h')
    for kid in node[1:]:
        if is_postverbal_adjunct_tag(kid.tag) or kid.tag.startswith('ADVP'):
            tag(kid, 'a')  # treat aspect particles as adjuncts
        elif not kid.tag.startswith('PU'):
            tag(kid, 'r')


def label(root):
    """Preprocess ``root``, tag the kids of each internal node, postprocess.

    For every non-leaf node yielded by ``nodes(root)`` the first matching
    configuration in the ``if``/``elif`` chain below decides which marker each
    kid receives through ``tag(kid, marker)``. The branch order is significant
    (see the inline comments). The marker letters presumably stand for head
    ('h'), adjunct ('a'), left/right argument ('l'/'r') and so on -- confirm
    against ``tag`` and its consumers.

    Args:
        root: root of the tree to label; passed through ``preprocess`` first.

    Returns:
        Whatever ``postprocess`` returns for the labelled tree.
    """
    root = preprocess(root)

    for node in nodes(root):
        if node.is_leaf():
            continue

        at_top = node.parent is None

        first_kid, _ = get_nonpunct_kid(node, get_last=False)
        last_kid, _ = get_nonpunct_kid(node, get_last=True)
        if first_kid is None:
            continue

        # Markers that do not depend on the configuration of the node.
        for kid in node:
            if has_modification_tag(kid):
                tag(kid, 'm')
            elif kid.tag == 'MSP':
                tag(kid, 'a')
            elif kid.tag == 'FLR':
                tag(kid, 'a')
            elif kid.tag == 'ETC':
                tag(kid, '&')
            else:
                tag_if_topicalisation(kid)

        if is_prn(node):
            # PRN tagging error in 10:49(69)
            if not first_kid:
                continue
            node.tag = first_kid.tag
            tag(node, 'p')
            tag(node[0], 'h')  # assume that the first PU introduces the PRN

        elif node.tag == 'FRAG':
            tag_adjunction(node, last_kid)

        # occasionally something that looks like CCG right absorption occurs
        # in the original annotation (0:23(8))
        elif is_right_absorption(node):
            pass

        elif is_predication(node):
            sbj_assigned = False
            vp_assigned = False
            for kid in reversed(node):
                if not sbj_assigned and (
                        '-SBJ' in kid.tag or
                        # TODO: we can get IP < NP-PN VP (0:40(5)). is this correct?
                        # exclude NP-PN-LOC (10:62(25))
                        ('-PN' in kid.tag and '-PN-LOC' not in kid.tag) or
                        # NP-APP VP in 11:31(88)
                        '-APP' in kid.tag or
                        kid.tag == "NP"):
                    tag(kid, 'l')  # TODO: is subject always left of predicate?
                    sbj_assigned = True
                elif not vp_assigned and kid.tag == 'VP':
                    tag(kid, 'h')
                    vp_assigned = True
                elif kid.tag not in ('PU', 'CC'):
                    tag(kid, 'a')

            if punct_cued_typechange:
                for i, kid in enumerate(node):
                    try:
                        # exclude IP-SBJ PU VP from having the PU tagged :h (1:53(9))
                        if ((kid.tag.startswith('IP-') or kid.tag.startswith('CP-'))
                                and kid.tag.find('-TPC') == -1
                                and kid.tag.find('-SBJ') == -1
                                and node[i + 1].tag == 'PU'):
                            tag(node[i + 1], 'h')
                    except Exception:
                        # Best effort (e.g. no node[i + 1] after the last kid),
                        # but let KeyboardInterrupt/SystemExit propagate.
                        continue

        elif (node.count() == 1 and node.tag.startswith('VP')
              and is_verb_compound(node[0])):
            pass

        elif is_vpt(node):  # fen de kai, da bu ying. vpt is head-final
            left = True
            for kid in node:
                if kid.tag.startswith("AD") or kid.tag.startswith("DER"):
                    tag(kid, 'h')
                    left = False
                elif left:
                    tag(kid, 'l')
                else:
                    tag(kid, 'r')

        elif is_vsb(node):
            # VSB is modifier+head, and hence is head-final
            # NOTE(review): despite the comment above, the code tags the
            # first kid as head, exactly like the VRD branch -- confirm.
            _tag_head_initial(node, first_kid)

        elif is_vcd(node):
            pass

        elif is_vcp(node):
            tag(first_kid, 'h')
            for kid in node[1:]:
                if kid.tag == "VC":
                    tag(kid, 'a')

        # NOTE(review): the is_vsb() test here looks unreachable because VSB
        # nodes are already claimed by the earlier branch; kept unchanged.
        elif is_vrd(node) or is_vsb(node):  # vrd is head-initial
            _tag_head_initial(node, first_kid)

        elif is_coordination(node, at_top=at_top):  # coordination
            for kid in node:
                if kid.tag == "ETC":
                    tag(kid, '&')
                if kid.tag not in ('CC', 'PU', 'CSC'):
                    tag(kid, 'c')

        elif is_np_internal_structure(node):
            first = True
            for kid in reversed(node.kids):
                if kid.tag == 'ETC':
                    tag(kid, '&')
                elif kid.tag not in ('CC', 'PU'):
                    if first:
                        tag(kid, 'N')
                        first = False
                    else:
                        tag(kid, 'n')

        # must be above is_coordination (it subsumes UCP)
        # NOTE(review): this branch currently sits *below* the
        # is_coordination branch, contrary to the comment -- confirm.
        elif is_ucp(node):
            left_conjunct_tag = first_kid.tag
            # NOTE:
            # There are some cases where the UCP annotation is suspect (1:36(11))
            # we will obtain the wrong analysis in these cases because the UCP
            # node does not directly dominate its conjuncts
            old_tag = node.tag
            node.tag = left_conjunct_tag
            node.tag = inherit_tag_str(node.tag, old_tag)

            for kid in nodes(node):
                if kid.tag is None:
                    # Diagnostic output only; the startswith() call below
                    # still fails for an untagged node, as before.
                    print(kid)
                    print(kid.tag)
                if kid.tag.startswith('UCP'):
                    kid.tag = left_conjunct_tag

            for kid in node:
                if kid.tag == 'ETC':
                    tag(kid, '&')
                elif kid.tag not in ('CC', 'PU'):
                    tag(kid, 'C')

        # exclude VP < VV AS: we want to tag this
        elif (node.count() == 1) or is_verb_compound(node):
            pass

        elif (first_kid.is_leaf()  # head initial complementation
              # quoted verb (see fix in _preprocess_ function)
              or satisfies_all(
                  lambda fkid: any(kid.is_leaf() and kid.tag == 'PU' for kid in fkid),
                  lambda fkid: any(kid.is_leaf() and kid.tag == 'VV' for kid in fkid))(first_kid)
              or is_verb_compound(first_kid)
              # HACK: to fix weird case of unary PP < P causing adjunction
              # analysis instead of head-initial (because of PP IP
              # configuration) in 10:76(4)
              or first_kid.tag == 'PP' and first_kid.count() == 1
              and first_kid[0].tag == "P"):
            _tag_head_initial(node, first_kid)

        # head final complementation
        # This has to come after head initial complementation, otherwise we
        # get the tagging VP < VV:l AS:h
        elif ((last_kid.is_leaf() or
               is_verb_compound(last_kid) or
               # lcp internal structure (cf 10:2(13)) is possible: despite the
               # structure (LCP (NP) (LCP)) this should be treated as
               # head-final complementation, not adjunction.
               is_lcp_internal_structure(last_kid))
              and not any(ftag in first_kid.tag for ftag in FunctionTags)):
            if last_kid.tag.startswith('SP'):
                # Treat the final question particle 吗 (ma) as the head to get
                # the type-change category S[q]\S[dcl] (25:21(5))
                if has_question_tag(node):
                    tag(last_kid, 'h')
                else:
                    tag(last_kid, 'a')
            else:
                tag(last_kid, 'h')

            # cf 2:23(7),1:9(28), a number of derivations have
            # (CP(WHNP-1 CP(IP) DEC) XP) instead of the expected
            # (CP (WHNP-1) CP(IP DEC) XP)
            # This lets us treat what would otherwise be considered head-final
            # as an adjunction
            # (the checks on last_kid.tag below run after last_kid was tagged)
            if last_kid.tag.startswith('DEC'):
                for kid in node[0:-1]:
                    if kid.tag.startswith('WHNP') or kid.tag.startswith('WHPP'):
                        tag(kid, 'a')
                    # ADVP as sibling of IP in 11:39(63)
                    elif not (kid.tag.startswith('PU') or kid.tag.startswith('ADVP')):
                        tag(kid, 'l')
            else:
                for kid in node[0:-1]:
                    if (last_kid.tag in VerbalCategories and (
                            is_postverbal_adjunct_tag(kid.tag) or
                            # exception added to account for direct
                            # modification of V{V,A} with ADVP (0:47(9))
                            kid.tag.startswith('ADVP'))):
                        tag(kid, 'a')  # treat aspect particles as adjuncts
                    elif not kid.tag.startswith('PU'):
                        tag(kid, 'l')

        # TODO: if this is below coordination, then NP(NP-APP PU NP) is
        # considered coordination instead of apposition (10:70(15))
        # actually, what happens if we get english-style apposition (NP1 , NP2)?
        elif is_apposition(node):
            # HACK: assume apposition is right-headed. (A disabled alternative
            # tagged last_kid 'h' when any kid was IP-APP.)
            tag(last_kid, 'r')

            for kid in node:
                if not kid.tag.startswith('PU'):
                    # exclude CP-APP (see is_apposition() above)
                    if kid.tag.endswith('-APP') and not kid.tag.startswith('CP'):
                        tag(kid, 'A')
                    else:
                        tag(kid, 'a')

        elif is_modification(node):
            tag(last_kid, 'h')
            for kid in node[0:-1]:
                if has_modification_tag(kid):
                    tag(kid, 'm')
                elif not kid.tag.startswith('PU'):
                    tag(kid, 'a')
                else:
                    tag_if_topicalisation(kid)

        elif is_argument_cluster(node):
            for kid in node:
                tag(kid, '@')

        else:  # adjunction
            tag_adjunction(node, last_kid)

    root = postprocess(root)
    return root
def preprocess(root):
    """Rewrite problematic tree configurations in place before labelling.

    Walks every non-leaf node yielded by ``nodes(root)`` and applies at most
    one of the corpus-specific repairs in the ``if``/``elif`` chain (the
    paired-quote reshaping and the optional LCP -> NP renaming run before the
    chain, independently of it). The ``N:M(K)`` references in the comments
    name the derivations that motivated each repair.

    Args:
        root: root node of the tree; it and its descendants are mutated.

    Returns:
        The same ``root`` object.
    """
    # IP < PP PU -> PP < PP PU (20:58(1))
    if root.count() == 2 and root[1].tag == 'PU' and root[0].tag.startswith('PP'):
        root.tag = root[0].tag

    for node in nodes(root):
        if node.is_leaf():
            continue

        if rewrite_lcp_as_np and node.tag.startswith('LCP'):
            node.tag = node.tag.replace('LCP', 'NP')

        first_kid, _ = get_nonpunct_kid(node, get_last=False)
        last_kid, last_kid_index = get_nonpunct_kid(node, get_last=True)

        # ---------------------
        # Where LPU, RPU are paired punctuation, reshape
        # YP(LPU ... XP RPU YP) into YP(XP(LPU ... XP) YP)
        if (any(kid.lex in ("“", "「") for kid in leaf_kids(node)) and
                any(kid.lex in ("”", "」") for kid in leaf_kids(node))):
            lqu = first_index_such_that(
                lambda kid: kid.is_leaf() and kid.lex in ("“", "「"), node)
            rqu = first_index_such_that(
                lambda kid: kid.is_leaf() and kid.lex in ("”", "」"), node)
            if rqu != node.count() - 1:
                quoted_kids = node.kids[lqu:rqu + 1]
                del node.kids[lqu:rqu + 1]

                last_nonpunct_kid, _ = get_nonpunct_element(quoted_kids, get_last=True)
                # Bad punctuation in 27:84(4) causes a mis-analysis, just ignore
                if last_nonpunct_kid:
                    quoted_node = Node(last_nonpunct_kid.tag, quoted_kids)
                    node.kids.insert(lqu, quoted_node)

        # CPTB/Chinese-specific fixes
        # ---------------------------
        # PP(P CP NP) in derivations like 5:11(3) should be PP(P NP(CP NP))
        if first_kid and first_kid.tag == "P" and node.count() > 2:
            last_tag = last_kid.tag
            rest = node.kids[1:]
            del node.kids[1:]
            node.kids.append(Node(last_tag, rest, node))

        # 2:12(3). DNP-PRD fixed by adding a layer of NP
        elif (node.tag.startswith('VP') and node.count() == 2 and
              node[0].tag.startswith('VC') and
              node[1].tag.startswith('DNP-PRD')):
            node[1] = Node('NP', [node[1]], node)

        # fix missing -OBJ tag from VP object complements (c.f. 31:18(4))
        elif (node.tag.startswith('VP') and node.count() >= 2 and
              node[0].tag == 'VV' and
              node[-1].tag == 'NP'):
            node[-1].tag += "-OBJ"

        # fix bad annotation IP < IP (2:7(28)), VP < VP (0:1(5))
        elif any(is_repeated_unary_projection(xp, node)
                 for xp in ('IP', 'VP', 'NP', 'CP')):
            node.kids = node[0].kids

        # treat DP-SBJ as QP-SBJ (6:37(9)): the rationale is that the
        # determiner (e.g. 每, mei "every") acts as a specifier, just like a
        # quantity
        elif node.tag == 'DP-SBJ':
            node.tag = 'QP-SBJ'

        # attach the PU preceding a PRN under the PRN
        elif last_kid and last_kid.tag == 'PRN' and last_kid.count() == 1:
            # Only a real predecessor qualifies: with the PRN in first
            # position, index -1 would wrap round to the *last* kid and pull a
            # trailing PU under the PRN. (Assumes last_kid_index is last_kid's
            # position within node -- confirm against get_nonpunct_kid.)
            if last_kid_index > 0:
                maybe_pu = node[last_kid_index - 1]
                if maybe_pu.tag == 'PU':
                    del node.kids[last_kid_index - 1]
                    last_kid.kids.insert(0, maybe_pu)  # prepend

        # DEG instead of DEC (29:34(3)). if there's a trace in DEG's sibling
        # and no DEC, then change DEG to DEC.
        elif (node.tag == 'CP' and node.count() == 2 and
              node[0].tag == 'IP' and node[1].tag == 'DEG'):
            if get_first(node[0], r'^/\*T\*/') and not get_first(node[0], r'/DEC/'):
                node[1].tag = 'DEC'

        elif (node.tag.startswith('NP') and
              any(kid.tag.startswith('QP-APP') for kid in node)):
            for kid in node:
                if kid.tag.startswith('QP-APP'):
                    kid.tag = kid.tag.replace('QP', 'NP')

        # NP(CP NP-APP NP-PN) -> NP(CP NP(NP-APP NP-PN)) so that
        # NP(NP-APP NP-PN) can receive NP internal structure-type analysis
        elif (node.tag.startswith('NP') and node.count() == 3 and
              node[0].tag.startswith('CP') and
              node[1].tag.startswith('NP-APP') and
              node[2].tag.startswith('NP-PN')):
            np_app, np_pn = node[1], node[2]
            del node.kids[1:]
            node.kids.append(Node(node.tag, [np_app, np_pn], node))

        # IP < NP-SBJ ADVP VP rather than IP < NP-SBJ VP(ADVP VP)
        # (25:59(12), 6:92(19))
        elif (node.tag == 'IP' and node.count() == 3 and
              node[0].tag == 'NP-SBJ' and node[1].tag == 'ADVP' and
              node[2].tag == 'VP'):
            advp = node.kids.pop(1)
            # VP is the new node[1]
            # now replace node[1] with Node(node[1])
            node[1] = Node(node[1].tag, [advp, node[1]], node)

        # fixing DNP(PN DEG), which causes mis-tagging DNP(PN:l DEG:h)
        # only 3 cases: 23:61(5), 9:14(14), 21:3(11)
        elif (node.tag == 'DNP' and node.count() == 2 and
              node[0].tag == 'PN' and node[1].tag == 'DEG'):
            replace_kid(node, node[0], Node('NP', [node[0]]))

        elif is_vnv(node) and node.count() == 3:
            # Re-analyse VNV as coordination
            node[1].tag = 'CC'

        # fix mistaggings of the form ADVP < JJ (1:7(9)), NP < JJ (5:35(1))
        elif node.count() == 1:
            # fix IP < VP by adding *pro*
            if node.tag.startswith('IP') and node[0].tag.startswith('VP'):
                leaf = Leaf('-NONE-', '*pro*', None)
                pro = Node('NP-SBJ', [leaf])
                node.kids.insert(0, pro)

            elif node[0].tag == 'JJ':
                if node.tag.startswith('ADVP'):
                    node.tag = node.tag.replace('ADVP', 'ADJP')
                elif node.tag.startswith('NP'):
                    node.tag = node.tag.replace('NP', 'ADJP')

            # fix NP < VV
            elif node.tag == 'NP' and node[0].tag == 'VV':
                node.tag = node.tag.replace('NP', 'VP')

            # fix NP < ADJP < JJ (5:35(1))
            elif node.tag == 'NP' and node[0].tag == 'ADJP':
                replace_kid(node.parent, node, node[0])

            # fix projections NP < QP
            elif node[0].tag.startswith('QP') and node.tag.startswith('NP'):
                inherit_tag(node[0], node)  # copy PCTB tags from NP to QP
                node.tag = node[0].tag  # copy QP to parent, replacing NP
                node.kids = node[0].kids

            elif node[0].tag == 'IP' and node.tag == 'CP-APP':
                inherit_tag(node[0], node)
                node.tag = node[0].tag
                node.kids = node[0].kids

            # CLP < NN
            elif node[0].tag == 'NN' and node.tag == 'CLP':
                node[0].tag = 'M'

            elif node[0].tag == 'NN' and node.tag.startswith("VP"):
                node[0].tag = 'VV'

            elif node[0].tag == 'CP':
                if node.tag == 'NP-PRD':
                    node.kids = node[0].kids
                else:
                    # Rewrite NP < { CP < { CP < DEC } }
                    # (i.e. 比 报告 的 早 一点) so that it's headed by the
                    # particle 的 (de)
                    expr = r'''/CP/ < { /CP/ < /DEC/ }'''
                    if get_first(node[0], expr):
                        node.kids = node[0].kids

            elif node[0].tag in ('NP', 'NP-PN', 'VP', 'IP') and node.tag == 'PRN':
                node.kids = node[0].kids

            # ADVP < CS: shrink so that CS will be considered the head by binarise
            # CP < M: tagging error 7:14(8), 10:51(4), 11:13(32), 11:15(47)
            elif ((node.tag == 'ADVP' and node[0].tag == 'CS') or
                  (node[0].tag == 'M' and node.tag == 'CP')):
                replace_kid(node.parent, node, node[0])

            # fix NP<DNP so that it's headed by the DEC 8:38(18), 0:30(4)
            elif node.tag.startswith('NP') and node[0].tag.startswith('DNP'):
                node.kids = node[0].kids

        # Reshape LB (long bei)
        # ---------------------
        elif first_kid and first_kid.tag == "LB":
            expr = r'''* < { /LB/=LB [ $ { * < /-(SBJ|OBJ|PN)/a=SBJ < /(V[PV]|VRD|VSB)/=PRED }=IP | $ { /CP/=CP < { *=IP < /-(SBJ|OBJ|PN)/a=SBJ < /(V[PV]|VRD|VSB)/=PRED } } ] }'''
            result = get_first(node, expr, with_context=True)
            # Guard against a failed match, as the BA branch below does,
            # instead of failing while unpacking the result.
            if result:
                top, ctx = result
                top.kids = [ctx.lb, Node('IP', [ctx.sbj, ctx.pred])]

        elif first_kid and first_kid.tag == "BA":
            expr = r'''* < { /BA/=LB $ { /IP/ < /NP/=SBJ < /VP/=PRED } }'''
            result = get_first(node, expr, with_context=True)
            if result:
                top, ctx = result
                # flat attachment; the nested Node('IP', [sbj, pred]) variant
                # used for LB is deliberately not applied here
                top.kids = [ctx.lb, ctx.sbj, ctx.pred]

        # single mistagging CP-SBJ for CP in 24:58(1)
        elif node.tag == 'CP-SBJ':
            node.tag = 'CP'

        else:
            # Fix missing phrasal layer in NP < NN DEG (21:10(4))
            result = get_first(node, r'/DNP/=P < { /N[NRT]/=N $ /DEG/ }',
                               with_context=True)
            if result:
                p, ctx = result
                n = ctx.n
                replace_kid(p, n, Node('NP', [n]))

            # Fix missing phrasal layer in LCP < NN LC (11:17(9))
            result = get_first(node, r'/LCP/=P < { /N[NRT]/=N $ /LC/ }',
                               with_context=True)
            if result:
                p, ctx = result
                n = ctx.n
                replace_kid(p, n, Node('NP', [n]))

            # Fix wrongly attached DEC (5:26(6))
            result = get_first(
                node, r'/CP/=TOP < { /IP/=P < { /NP/ $ /VP/ $ /DEC/=DEC } }',
                with_context=True)
            if result:
                _, ctx = result
                top, p, dec = ctx.top, ctx.p, ctx.dec
                top.kids.append(dec)
                p.kids.remove(dec)

            result = get_first(
                node, r'*=PP < { /IP-TPC/=P <1 { /NP/=T < ^/\*PRO\*/ } <2 /VP/=S }',
                nonrecursive=True, with_context=True)
            if result:
                _, ctx = result
                pp, p, s = ctx.pp, ctx.p, ctx.s
                inherit_tag(s, p)
                replace_kid(pp, p, s)

            expr = r'''/VP/=VP <1 /VV/=V <2 { /IP-OBJ/ <1 /NP-SBJ/=SBJ <2 /VP/=PRED }'''
            result = get_first(node, expr, with_context=True)
            if result:
                _, ctx = result
                vp, v, sbj, pred = ctx.vp, ctx.v, ctx.sbj, ctx.pred
                del vp.kids
                # drop the embedded subject when it dominates a *PRO* leaf
                if get_first(sbj, r'* < ^/\*PRO\*/'):
                    vp.kids = [v, pred]
                else:
                    vp.kids = [v, sbj, pred]

            expr = r'''/QP/=P <1 /CD/ <2 /CC/ <3 /CD/'''
            result = get_first(node, expr, with_context=True)
            if result:
                _, ctx = result
                p = ctx.p
                # Only regroup when something follows the CD CC CD triple.
                if p.count() > 3:
                    cd_cc_cd = p.kids[0:3]
                    del p.kids[0:3]
                    p.kids.insert(0, Node('QP', cd_cc_cd))

    return root
def preprocess(root):
    """Rewrite problematic tree configurations in place before labelling.

    Walks every non-leaf node yielded by ``nodes(root)`` and applies at most
    one of the corpus-specific repairs in the ``if``/``elif`` chain (the
    paired-quote reshaping and the optional LCP -> NP renaming run before the
    chain, independently of it). The ``N:M(K)`` references in the comments
    name the derivations that motivated each repair.

    Args:
        root: root node of the tree; it and its descendants are mutated.

    Returns:
        The same ``root`` object.
    """
    # IP < PP PU -> PP < PP PU (20:58(1))
    if root.count() == 2 and root[1].tag == 'PU' and root[0].tag.startswith('PP'):
        root.tag = root[0].tag

    for node in nodes(root):
        if node.is_leaf():
            continue

        if rewrite_lcp_as_np and node.tag.startswith('LCP'):
            node.tag = node.tag.replace('LCP', 'NP')

        first_kid, _ = get_nonpunct_kid(node, get_last=False)
        last_kid, last_kid_index = get_nonpunct_kid(node, get_last=True)

        # ---------------------
        # Where LPU, RPU are paired punctuation, reshape
        # YP(LPU ... XP RPU YP) into YP(XP(LPU ... XP) YP)
        if (any(kid.lex in ("“", "「") for kid in leaf_kids(node)) and
                any(kid.lex in ("”", "」") for kid in leaf_kids(node))):
            lqu = first_index_such_that(
                lambda kid: kid.is_leaf() and kid.lex in ("“", "「"), node)
            rqu = first_index_such_that(
                lambda kid: kid.is_leaf() and kid.lex in ("”", "」"), node)
            if rqu != node.count() - 1:
                quoted_kids = node.kids[lqu:rqu + 1]
                del node.kids[lqu:rqu + 1]

                last_nonpunct_kid, _ = get_nonpunct_element(quoted_kids, get_last=True)
                # Bad punctuation in 27:84(4) causes a mis-analysis, just ignore
                if last_nonpunct_kid:
                    quoted_node = Node(last_nonpunct_kid.tag, quoted_kids)
                    node.kids.insert(lqu, quoted_node)

        # CPTB/Chinese-specific fixes
        # ---------------------------
        # PP(P CP NP) in derivations like 5:11(3) should be PP(P NP(CP NP))
        if first_kid and first_kid.tag == "P" and node.count() > 2:
            last_tag = last_kid.tag
            rest = node.kids[1:]
            del node.kids[1:]
            node.kids.append(Node(last_tag, rest, node))

        # 2:12(3). DNP-PRD fixed by adding a layer of NP
        elif (node.tag.startswith('VP') and node.count() == 2 and
              node[0].tag.startswith('VC') and
              node[1].tag.startswith('DNP-PRD')):
            node[1] = Node('NP', [node[1]], node)

        # fix missing -OBJ tag from VP object complements (c.f. 31:18(4))
        elif (node.tag.startswith('VP') and node.count() >= 2 and
              node[0].tag == 'VV' and
              node[-1].tag == 'NP'):
            node[-1].tag += "-OBJ"

        # fix bad annotation IP < IP (2:7(28)), VP < VP (0:1(5))
        elif any(is_repeated_unary_projection(xp, node)
                 for xp in ('IP', 'VP', 'NP', 'CP')):
            node.kids = node[0].kids

        # treat DP-SBJ as QP-SBJ (6:37(9)): the rationale is that the
        # determiner (e.g. 每, mei "every") acts as a specifier, just like a
        # quantity
        elif node.tag == 'DP-SBJ':
            node.tag = 'QP-SBJ'

        # attach the PU preceding a PRN under the PRN
        elif last_kid and last_kid.tag == 'PRN' and last_kid.count() == 1:
            # Only a real predecessor qualifies: with the PRN in first
            # position, index -1 would wrap round to the *last* kid and pull a
            # trailing PU under the PRN. (Assumes last_kid_index is last_kid's
            # position within node -- confirm against get_nonpunct_kid.)
            if last_kid_index > 0:
                maybe_pu = node[last_kid_index - 1]
                if maybe_pu.tag == 'PU':
                    del node.kids[last_kid_index - 1]
                    last_kid.kids.insert(0, maybe_pu)  # prepend

        # DEG instead of DEC (29:34(3)). if there's a trace in DEG's sibling
        # and no DEC, then change DEG to DEC.
        elif (node.tag == 'CP' and node.count() == 2 and
              node[0].tag == 'IP' and node[1].tag == 'DEG'):
            if get_first(node[0], r'^/\*T\*/') and not get_first(node[0], r'/DEC/'):
                node[1].tag = 'DEC'

        elif (node.tag.startswith('NP') and
              any(kid.tag.startswith('QP-APP') for kid in node)):
            for kid in node:
                if kid.tag.startswith('QP-APP'):
                    kid.tag = kid.tag.replace('QP', 'NP')

        # NP(CP NP-APP NP-PN) -> NP(CP NP(NP-APP NP-PN)) so that
        # NP(NP-APP NP-PN) can receive NP internal structure-type analysis
        elif (node.tag.startswith('NP') and node.count() == 3 and
              node[0].tag.startswith('CP') and
              node[1].tag.startswith('NP-APP') and
              node[2].tag.startswith('NP-PN')):
            np_app, np_pn = node[1], node[2]
            del node.kids[1:]
            node.kids.append(Node(node.tag, [np_app, np_pn], node))

        # IP < NP-SBJ ADVP VP rather than IP < NP-SBJ VP(ADVP VP)
        # (25:59(12), 6:92(19))
        elif (node.tag == 'IP' and node.count() == 3 and
              node[0].tag == 'NP-SBJ' and node[1].tag == 'ADVP' and
              node[2].tag == 'VP'):
            advp = node.kids.pop(1)
            # VP is the new node[1]
            # now replace node[1] with Node(node[1])
            node[1] = Node(node[1].tag, [advp, node[1]], node)

        # fixing DNP(PN DEG), which causes mis-tagging DNP(PN:l DEG:h)
        # only 3 cases: 23:61(5), 9:14(14), 21:3(11)
        elif (node.tag == 'DNP' and node.count() == 2 and
              node[0].tag == 'PN' and node[1].tag == 'DEG'):
            replace_kid(node, node[0], Node('NP', [node[0]]))

        elif is_vnv(node) and node.count() == 3:
            # Re-analyse VNV as coordination
            node[1].tag = 'CC'

        # fix mistaggings of the form ADVP < JJ (1:7(9)), NP < JJ (5:35(1))
        elif node.count() == 1:
            # fix IP < VP by adding *pro*
            if node.tag.startswith('IP') and node[0].tag.startswith('VP'):
                leaf = Leaf('-NONE-', '*pro*', None)
                pro = Node('NP-SBJ', [leaf])
                node.kids.insert(0, pro)

            elif node[0].tag == 'JJ':
                if node.tag.startswith('ADVP'):
                    node.tag = node.tag.replace('ADVP', 'ADJP')
                elif node.tag.startswith('NP'):
                    node.tag = node.tag.replace('NP', 'ADJP')

            # fix NP < VV
            elif node.tag == 'NP' and node[0].tag == 'VV':
                node.tag = node.tag.replace('NP', 'VP')

            # fix NP < ADJP < JJ (5:35(1))
            elif node.tag == 'NP' and node[0].tag == 'ADJP':
                replace_kid(node.parent, node, node[0])

            # fix projections NP < QP
            elif node[0].tag.startswith('QP') and node.tag.startswith('NP'):
                inherit_tag(node[0], node)  # copy PCTB tags from NP to QP
                node.tag = node[0].tag  # copy QP to parent, replacing NP
                node.kids = node[0].kids

            elif node[0].tag == 'IP' and node.tag == 'CP-APP':
                inherit_tag(node[0], node)
                node.tag = node[0].tag
                node.kids = node[0].kids

            # CLP < NN
            elif node[0].tag == 'NN' and node.tag == 'CLP':
                node[0].tag = 'M'

            elif node[0].tag == 'NN' and node.tag.startswith("VP"):
                node[0].tag = 'VV'

            elif node[0].tag == 'CP':
                if node.tag == 'NP-PRD':
                    node.kids = node[0].kids
                else:
                    # Rewrite NP < { CP < { CP < DEC } }
                    # (i.e. 比 报告 的 早 一点) so that it's headed by the
                    # particle 的 (de)
                    expr = r'''/CP/ < { /CP/ < /DEC/ }'''
                    if get_first(node[0], expr):
                        node.kids = node[0].kids

            elif node[0].tag in ('NP', 'NP-PN', 'VP', 'IP') and node.tag == 'PRN':
                node.kids = node[0].kids

            # ADVP < CS: shrink so that CS will be considered the head by binarise
            # CP < M: tagging error 7:14(8), 10:51(4), 11:13(32), 11:15(47)
            elif ((node.tag == 'ADVP' and node[0].tag == 'CS') or
                  (node[0].tag == 'M' and node.tag == 'CP')):
                replace_kid(node.parent, node, node[0])

            # fix NP<DNP so that it's headed by the DEC 8:38(18), 0:30(4)
            elif node.tag.startswith('NP') and node[0].tag.startswith('DNP'):
                node.kids = node[0].kids

        # Reshape LB (long bei)
        # ---------------------
        elif first_kid and first_kid.tag == "LB":
            expr = r'''* < { /LB/=LB [ $ { * < /-(SBJ|OBJ|PN)/a=SBJ < /(V[PV]|VRD|VSB)/=PRED }=IP | $ { /CP/=CP < { *=IP < /-(SBJ|OBJ|PN)/a=SBJ < /(V[PV]|VRD|VSB)/=PRED } } ] }'''
            result = get_first(node, expr, with_context=True)
            # Guard against a failed match, as the BA branch below does,
            # instead of failing while unpacking the result.
            if result:
                top, ctx = result
                top.kids = [ctx.lb, Node('IP', [ctx.sbj, ctx.pred])]

        elif first_kid and first_kid.tag == "BA":
            expr = r'''* < { /BA/=LB $ { /IP/ < /NP/=SBJ < /VP/=PRED } }'''
            result = get_first(node, expr, with_context=True)
            if result:
                top, ctx = result
                # flat attachment; the nested Node('IP', [sbj, pred]) variant
                # used for LB is deliberately not applied here
                top.kids = [ctx.lb, ctx.sbj, ctx.pred]

        # single mistagging CP-SBJ for CP in 24:58(1)
        elif node.tag == 'CP-SBJ':
            node.tag = 'CP'

        else:
            # Fix missing phrasal layer in NP < NN DEG (21:10(4))
            result = get_first(node, r'/DNP/=P < { /N[NRT]/=N $ /DEG/ }',
                               with_context=True)
            if result:
                p, ctx = result
                n = ctx.n
                replace_kid(p, n, Node('NP', [n]))

            # Fix missing phrasal layer in LCP < NN LC (11:17(9))
            result = get_first(node, r'/LCP/=P < { /N[NRT]/=N $ /LC/ }',
                               with_context=True)
            if result:
                p, ctx = result
                n = ctx.n
                replace_kid(p, n, Node('NP', [n]))

            # Fix wrongly attached DEC (5:26(6))
            result = get_first(
                node, r'/CP/=TOP < { /IP/=P < { /NP/ $ /VP/ $ /DEC/=DEC } }',
                with_context=True)
            if result:
                _, ctx = result
                top, p, dec = ctx.top, ctx.p, ctx.dec
                top.kids.append(dec)
                p.kids.remove(dec)

            result = get_first(
                node, r'*=PP < { /IP-TPC/=P <1 { /NP/=T < ^/\*PRO\*/ } <2 /VP/=S }',
                nonrecursive=True, with_context=True)
            if result:
                _, ctx = result
                pp, p, s = ctx.pp, ctx.p, ctx.s
                inherit_tag(s, p)
                replace_kid(pp, p, s)

            expr = r'''/VP/=VP <1 /VV/=V <2 { /IP-OBJ/ <1 /NP-SBJ/=SBJ <2 /VP/=PRED }'''
            result = get_first(node, expr, with_context=True)
            if result:
                _, ctx = result
                vp, v, sbj, pred = ctx.vp, ctx.v, ctx.sbj, ctx.pred
                del vp.kids
                # drop the embedded subject when it dominates a *PRO* leaf
                if get_first(sbj, r'* < ^/\*PRO\*/'):
                    vp.kids = [v, pred]
                else:
                    vp.kids = [v, sbj, pred]

            expr = r'''/QP/=P <1 /CD/ <2 /CC/ <3 /CD/'''
            result = get_first(node, expr, with_context=True)
            if result:
                _, ctx = result
                p = ctx.p
                # Only regroup when something follows the CD CC CD triple.
                if p.count() > 3:
                    cd_cc_cd = p.kids[0:3]
                    del p.kids[0:3]
                    p.kids.insert(0, Node('QP', cd_cc_cd))

    return root