def reshape_for_coordination(node, inside_np_internal_structure):
    """Rebuild a coordination node as a chain of nested binary nodes.

    A node with fewer than three kids is passed straight to
    label_adjunction (with do_labelling=False) and that result is returned.

    Otherwise kids are obtained one at a time from get_kid and folded into
    binary Nodes with head_index=1, where the second kid of each new Node is
    the structure built so far.  Intermediate nodes are tagged with
    base_tag(node.tag, strip_cptb_tag=False); the outermost node receives
    node.tag unchanged.

    Punctuation rule (carried out by get_kid, which is defined elsewhere):
        (XP PU) (CC XP)
    if we get contiguous PU CC, associate the PU with the previous conjunct,
    but:
        XP (PU XP) (CC XP)
        XP (PU XP PU) (CC XP)
    i.e. attach PU to the right _unless_ it is followed by CC.

    NOTE(review): the loop below terminates only if get_kid removes entries
    from the list it is given -- confirm against get_kid.
    """
    if node.count() < 3:
        return label_adjunction(node,
                                inside_np_internal_structure=inside_np_internal_structure,
                                do_labelling=False)

    inner_tag = base_tag(node.tag, strip_cptb_tag=False)

    # This is node.kids itself, not a copy.
    remaining = node.kids

    # The second return value of get_kid is threaded through successive calls.
    rightmost, cc_state = get_kid(remaining, False)
    neighbour, cc_state = get_kid(remaining, cc_state)
    tree = Node(inner_tag, [neighbour, rightmost], head_index=1)

    while remaining:
        nxt, cc_state = get_kid(remaining, cc_state)
        tree = Node(inner_tag, [nxt, tree], head_index=1)

    # The outermost node carries the original, unstripped tag.
    tree.tag = node.tag
    return tree
def label_predication(node, inherit_tag=False):
    """Label node's kids and fold them into nested binary nodes (head_index=1).

    Each kid is first passed through label_node.  Two labelled kids are then
    taken with twice(get_kid_) to form the innermost binary Node; every
    further kid obtained from get_kid_ becomes the first kid of a new Node
    whose second kid is the structure built so far.

    Intermediate nodes are tagged with strip_tag_if(not inherit_tag, node.tag);
    the outermost node has node.tag restored on it.

    NOTE(review): assumes get_kid_ removes the element it returns from the
    list (the loop relies on the list becoming empty) -- confirm.
    """
    inner_tag = strip_tag_if(not inherit_tag, node.tag)
    pending = [label_node(child) for child in node.kids]

    rightmost, neighbour = twice(get_kid_)(pending)
    tree = Node(inner_tag, [neighbour, rightmost], head_index=1)

    while pending:
        tree = Node(inner_tag, [get_kid_(pending), tree], head_index=1)

    # Restore the full tag at the topmost level.
    tree.tag = node.tag
    return tree
def label_head_initial(node, inherit_tag=False):
    """Label node's kids and fold them into nested binary nodes (head_index=0).

    Each kid is passed through label_node.  The first two labelled kids form
    the innermost binary Node; each later kid, in order, becomes the second
    kid of a new Node whose first kid is the structure built so far.

    Intermediate nodes are tagged with strip_tag_if(flag, node.tag), where
    flag is `not inherit_tag`, except that has_tag(node, 'c') being true
    forces inherit_tag to be treated as False.  The outermost node gets
    node.tag.
    """
    strip = True if has_tag(node, 'c') else not inherit_tag
    inner_tag = strip_tag_if(strip, node.tag)

    # Reverse so that pop() hands the kids back in their original order.
    pending = [label_node(child) for child in node.kids]
    pending.reverse()

    leftmost, neighbour = twice(pending.pop)()
    tree = Node(inner_tag, [leftmost, neighbour], head_index=0)

    while pending:
        tree = Node(inner_tag, [tree, pending.pop()], head_index=0)

    tree.tag = node.tag
    return tree
def label_apposition(node, inherit_tag=False, inside_np_internal_structure=False):
    """Label node's kids and fold them into nested binary nodes (head_index=1).

    Each kid is labelled with label_node, forwarding
    inside_np_internal_structure.  One labelled kid is taken with get_kid_ as
    the starting point; every further kid obtained from get_kid_ becomes the
    first kid of a new Node whose second kid is the structure built so far.

    If only one kid is available, that labelled kid itself is returned.  In
    every case the returned object's tag is set to node.tag; intermediate
    nodes use strip_tag_if(not inherit_tag, node.tag).

    NOTE(review): assumes get_kid_ removes the element it returns from the
    list -- confirm.
    """
    inner_tag = strip_tag_if(not inherit_tag, node.tag)
    pending = [label_node(child, inside_np_internal_structure=inside_np_internal_structure)
               for child in node.kids]

    tree = get_kid_(pending)
    while pending:
        tree = Node(inner_tag, [get_kid_(pending), tree], head_index=1)

    tree.tag = node.tag
    return tree
def label_adjunction(node, inherit_tag=False, do_labelling=True, inside_np_internal_structure=False):
    """Fold node's kids into nested binary nodes (head_index=1).

    With do_labelling true, each kid is first passed through label_node
    (forwarding inside_np_internal_structure) and the fold works on that new
    list.  With do_labelling false, the fold works directly on node.kids --
    the very same list object, not a copy.

    One kid is taken with get_kid_ as the starting point; every further kid
    obtained from get_kid_ becomes the first kid of a new Node whose second
    kid is the structure built so far.  If only one kid is available, that
    kid itself is returned.  In every case the returned object's tag is set
    to node.tag; intermediate nodes use
    strip_tag_if(not inherit_tag, node.tag).

    NOTE(review): assumes get_kid_ removes the element it returns from the
    list, which would empty node.kids when do_labelling is false -- confirm.
    """
    inner_tag = strip_tag_if(not inherit_tag, node.tag)

    if do_labelling:
        pending = [label_node(child, inside_np_internal_structure=inside_np_internal_structure)
                   for child in node.kids]
    else:
        pending = node.kids

    tree = get_kid_(pending)
    while pending:
        tree = Node(inner_tag, [get_kid_(pending), tree], head_index=1)

    tree.tag = node.tag
    return tree
def preprocess(root):
    """Apply a series of in-place tree rewrites under `root` and return `root`.

    After one root-level retagging, every non-leaf node yielded by
    nodes(root) is processed:

      1. optional LCP -> NP retagging (controlled by the name
         rewrite_lcp_as_np, which is read from the enclosing scope);
      2. regrouping of a span delimited by paired quotation marks;
      3. one long if/elif chain of annotation fixes, of which at most one
         branch runs per node.

    Tags and kid lists of existing nodes are mutated directly.

    References such as 5:11(3) in the comments appear to identify corpus
    derivations -- their exact format cannot be verified from this function.

    NOTE(review): this block was reconstructed from a whitespace-collapsed
    source.  Two nesting decisions should be checked against the original
    file: (a) the LB / BA / CP-SBJ branches are placed in the outer chain,
    not in the `node.count() == 1` sub-chain; (b) the last three rewrites
    (IP-TPC, VP < VV IP-OBJ, QP < CD CC CD) are placed inside the final
    `else`.
    """
    # IP < PP PU -> PP < PP PU (20:58(1))
    if root.count() == 2 and root[1].tag == 'PU' and root[0].tag.startswith('PP'):
        root.tag = root[0].tag

    # NOTE(review): the tree is modified while nodes(root) is being consumed;
    # whether nodes() tolerates that is not visible here.
    for node in nodes(root):
        if node.is_leaf():
            continue

        if rewrite_lcp_as_np and node.tag.startswith('LCP'):
            node.tag = node.tag.replace('LCP', 'NP')

        # These are computed before the quotation reshaping below and are not
        # recomputed afterwards.
        first_kid, first_kid_index = get_nonpunct_kid(node, get_last=False)
        last_kid, last_kid_index = get_nonpunct_kid(node, get_last=True)
        # ---------------------
        # Where LPU, RPU are paired punctuation, reshape YP(LPU ... XP RPU YP) into YP(XP(LPU ... XP) YP)
        if any(kid.lex in ("“", "「") for kid in leaf_kids(node)) and any(kid.lex in ("”", "」") for kid in leaf_kids(node)):
            lqu = first_index_such_that(lambda kid: kid.is_leaf() and kid.lex in ("“", "「"), node)
            rqu = first_index_such_that(lambda kid: kid.is_leaf() and kid.lex in ("”", "」"), node)
            # Only regroup when the closing mark is not the node's final kid.
            if rqu != node.count()-1:
                quoted_kids = node.kids[lqu:rqu+1]
                del node.kids[lqu:rqu+1]
                last_nonpunct_kid, _ = get_nonpunct_element(quoted_kids, get_last=True)
                # Bad punctuation in 27:84(4) causes a mis-analysis, just ignore
                # NOTE(review): when this test fails, the span deleted above
                # is not put back into node.kids.
                if last_nonpunct_kid:
                    quoted_node = Node(last_nonpunct_kid.tag, quoted_kids)
                    node.kids.insert(lqu, quoted_node)

        # CPTB/Chinese-specific fixes
        # ---------------------------
        # PP(P CP NP) in derivations like 5:11(3) should be PP(P NP(CP NP))
        if first_kid and first_kid.tag == "P" and node.count() > 2:
            last_tag = last_kid.tag
            rest = node.kids[1:]
            del node.kids[1:]
            node.kids.append(Node(last_tag, rest, node))
        # 2:12(3). DNP-PRD fixed by adding a layer of NP
        elif (node.tag.startswith('VP') and node.count() == 2 and node[0].tag.startswith('VC') and node[1].tag.startswith('DNP-PRD')):
            node[1] = Node('NP', [node[1]], node)
        # fix missing -OBJ tag from VP object complements (c.f. 31:18(4))
        # (node.tag.startswith('VP') is tested twice; the second test is redundant)
        elif (node.tag.startswith('VP') and node.count() >= 2 and node.tag.startswith('VP') and node[0].tag == 'VV' and node[-1].tag == 'NP'):
            node[-1].tag += "-OBJ"
        # fix bad annotation IP < IP (2:7(28)), VP < VP (0:1(5))
        elif any(is_repeated_unary_projection(xp, node) for xp in ('IP', 'VP', 'NP', 'CP')):
            node.kids = node[0].kids
        # treat DP-SBJ as QP-SBJ (6:37(9)): the rationale is that the determiner (e.g. 每) acts as a specifier,
        # just like a quantity
        elif node.tag == 'DP-SBJ':
            node.tag = 'QP-SBJ'
        # attach the PU preceding a PRN under the PRN
        elif last_kid and last_kid.tag == 'PRN' and last_kid.count() == 1:
            maybe_pu = node[last_kid_index-1]
            if maybe_pu.tag == 'PU':
                del node.kids[last_kid_index-1]
                last_kid.kids.insert(0, maybe_pu) # prepend
        # DEG instead of DEC (29:34(3)). if there's a trace in DEG's sibling and no DEC, then change DEG to DEC.
        elif node.tag == 'CP' and node.count() == 2 and node[0].tag == 'IP' and node[1].tag == 'DEG':
            if get_first(node[0], r'^/\*T\*/') and not get_first(node[0], r'/DEC/'):
                node[1].tag = 'DEC'
        # retag every QP-APP kid of an NP as NP-APP
        elif node.tag.startswith('NP') and any(kid.tag.startswith('QP-APP') for kid in node):
            for kid in node:
                if kid.tag.startswith('QP-APP'):
                    kid.tag = kid.tag.replace('QP', 'NP')
        # NP(CP NP-APP NP-PN) -> NP(CP NP(NP-APP NP-PN)) so that NP(NP-APP NP-PN) can receive NP internal structure-type analysis
        elif node.tag.startswith('NP') and node.count() == 3 and node[0].tag.startswith('CP') and node[1].tag.startswith('NP-APP') and node[2].tag.startswith('NP-PN'):
            np_app, np_pn = node[1], node[2]
            del node.kids[1:]
            node.kids.append(Node(node.tag, [np_app, np_pn], node))
        # IP < NP-SBJ ADVP VP rather than IP < NP-SBJ VP(ADVP VP) (25:59(12), 6:92(19))
        elif node.tag == 'IP' and node.count() == 3 and node[0].tag == 'NP-SBJ' and node[1].tag == 'ADVP' and node[2].tag == 'VP':
            advp = node.kids.pop(1)
            # VP is the new node[1]
            # now replace node[1] with Node(node[1])
            node[1] = Node(node[1].tag, [advp, node[1]], node)
        # fixing DNP(PN DEG), which causes mis-tagging DNP(PN:l DEG:h)
        # only 3 cases: 23:61(5), 9:14(14), 21:3(11)
        elif node.tag == 'DNP' and node.count() == 2 and node[0].tag == 'PN' and node[1].tag == 'DEG':
            replace_kid(node, node[0], Node('NP', [node[0]]))
        elif is_vnv(node) and node.count() == 3:
            # Re-analyse VNV as coordination
            node[1].tag = 'CC'
        # fix mistaggings of the form ADVP < JJ (1:7(9)), NP < JJ (5:35(1))
        elif node.count() == 1:
            # Unary nodes: a sub-chain keyed on the tags of the node and its only kid.
            # fix IP < VP by adding *pro*
            if node.tag.startswith('IP') and node[0].tag.startswith('VP'):
                leaf = Leaf('-NONE-', '*pro*', None)
                pro = Node('NP-SBJ', [leaf])
                node.kids.insert(0, pro)
            elif node[0].tag == 'JJ':
                if node.tag.startswith('ADVP'):
                    node.tag = node.tag.replace('ADVP', 'ADJP')
                elif node.tag.startswith('NP'):
                    node.tag = node.tag.replace('NP', 'ADJP')
            # fix NP < VV
            elif node.tag == 'NP' and node[0].tag == 'VV':
                node.tag = node.tag.replace('NP', 'VP')
            # fix NP < ADJP < JJ (5:35(1))
            elif node.tag == 'NP' and node[0].tag == 'ADJP':
                replace_kid(node.parent, node, node[0])
            # fix projections NP < QP
            elif node[0].tag.startswith('QP') and node.tag.startswith('NP'):
                inherit_tag(node[0], node) # copy PCTB tags from NP to QP
                node.tag = node[0].tag # copy QP to parent, replacing NP
                node.kids = node[0].kids
            elif node[0].tag == 'IP' and node.tag == 'CP-APP':
                inherit_tag(node[0], node)
                node.tag = node[0].tag
                node.kids = node[0].kids
            # CLP < NN
            elif node[0].tag == 'NN' and node.tag == 'CLP':
                node[0].tag = 'M'
            elif node[0].tag == 'NN' and node.tag.startswith("VP"):
                node[0].tag = 'VV'
            elif node[0].tag == 'CP':
                if node.tag == 'NP-PRD':
                    node.kids = node[0].kids
                else:
                    # Rewrite NP < { CP < { CP < DEC } }
                    # (i.e. 比 报告 的 早 一点) so that it's headed by the 的
                    expr = r'''/CP/ < { /CP/ < /DEC/ }'''
                    if get_first(node[0], expr):
                        node.kids = node[0].kids
            elif node[0].tag in ('NP', 'NP-PN', 'VP', 'IP') and node.tag == 'PRN':
                node.kids = node[0].kids
            # ADVP < CS: shrink so that CS will be considered the head by binarise
            # CP < M: tagging error 7:14(8), 10:51(4), 11:13(32), 11:15(47)
            elif ((node.tag == 'ADVP' and node[0].tag == 'CS') or (node[0].tag == 'M' and node.tag == 'CP')):
                replace_kid(node.parent, node, node[0])
            # fix NP<DNP so that it's headed by the DEC 8:38(18), 0:30(4)
            elif node.tag.startswith('NP') and node[0].tag.startswith('DNP'):
                node.kids = node[0].kids
            # elif node.tag == 'VP' and node[0].tag == 'NP-PRD':
            #     replace_kid(node.parent, node, node[0])
            # couple of noisy derivs like 10:35(80), 10:26(121), 11:37(3)
            # elif node.tag == 'VP' and node[0].tag.startswith('IP'):
            #     replace_kid(node.parent, node, node[0])
        # Reshape LB (long bei)
        # ---------------------
        elif first_kid and first_kid.tag == "LB":
            expr = r'''* < { /LB/=LB [ $ { * < /-(SBJ|OBJ|PN)/a=SBJ < /(V[PV]|VRD|VSB)/=PRED }=IP | $ { /CP/=CP < { *=IP < /-(SBJ|OBJ|PN)/a=SBJ < /(V[PV]|VRD|VSB)/=PRED } } ] }'''
            # NOTE(review): unlike the BA branch below, the result of
            # get_first is unpacked here without first checking that a match
            # was found.
            top, ctx = get_first(node, expr, with_context=True)
            # cp and ip are bound but not used afterwards.
            lb, sbj, pred, cp, ip = ctx.lb, ctx.sbj, ctx.pred, ctx.cp, ctx.ip
            top.kids = [lb, Node('IP', [sbj, pred])]
            # top.kids = [lb, sbj, pred]
        # elif False:
        elif first_kid and first_kid.tag == "BA":
            expr = r'''* < { /BA/=LB $ { /IP/ < /NP/=SBJ < /VP/=PRED } }'''
            result = get_first(node, expr, with_context=True)
            if result:
                top, ctx = result
                # ip is bound but not used afterwards.
                lb, sbj, pred, ip = ctx.lb, ctx.sbj, ctx.pred, ctx.ip
                # top.kids = [lb, Node('IP', [sbj, pred])]
                top.kids = [lb, sbj, pred]
        # single mistagging CP-SBJ for CP in 24:58(1)
        elif node.tag == 'CP-SBJ':
            node.tag = 'CP'
        else:
            # No structural branch applied: run a sequence of pattern-based
            # fixes, each guarded by whether get_first returned a match.

            # Fix missing phrasal layer in NP < NN DEG (21:10(4))
            result = get_first(node, r'/DNP/=P < { /N[NRT]/=N $ /DEG/ }', with_context=True)
            if result:
                p, ctx = result
                n = ctx.n
                replace_kid(p, n, Node('NP', [n]))
            # Fix missing phrasal layer in LCP < NN LC (11:17(9))
            result = get_first(node, r'/LCP/=P < { /N[NRT]/=N $ /LC/ }', with_context=True)
            if result:
                p, ctx = result
                n = ctx.n
                replace_kid(p, n, Node('NP', [n]))
            # Fix wrongly attached DEC (5:26(6))
            result = get_first(node, r'/CP/=TOP < { /IP/=P < { /NP/ $ /VP/ $ /DEC/=DEC } }', with_context=True)
            if result:
                _, ctx = result
                top, p, dec = ctx.top, ctx.p, ctx.dec
                # Move the DEC from the IP up to the CP.
                top.kids.append(dec)
                p.kids.remove(dec)
            # Replace an IP-TPC (kids: NP over a *PRO* leaf, then VP) by its
            # VP, after inherit_tag(s, p).
            result = get_first(node, r'*=PP < { /IP-TPC/=P <1 { /NP/=T < ^/\*PRO\*/ } <2 /VP/=S }', nonrecursive=True, with_context=True)
            if result:
                _, ctx = result
                pp, p, s = ctx.pp, ctx.p, ctx.s
                inherit_tag(s, p)
                replace_kid(pp, p, s)
            # Flatten VP(VV IP-OBJ(NP-SBJ VP)): the IP-OBJ layer is dropped,
            # and the subject too when it matches the *PRO* pattern.
            expr = r'''/VP/=VP <1 /VV/=V <2 { /IP-OBJ/ <1 /NP-SBJ/=SBJ <2 /VP/=PRED }'''
            result = get_first(node, expr, with_context=True)
            if result:
                _, ctx = result
                vp, v, sbj, pred = ctx.vp, ctx.v, ctx.sbj, ctx.pred
                # The kids attribute is deleted, then reassigned below.
                del vp.kids
                if get_first(sbj, r'* < ^/\*PRO\*/'):
                    vp.kids = [v, pred]
                else:
                    vp.kids = [v, sbj, pred]
            # Group a leading CD CC CD inside a longer QP under a new QP node.
            expr = r'''/QP/=P <1 /CD/ <2 /CC/ <3 /CD/'''
            result = get_first(node, expr, with_context=True)
            if result:
                _, ctx = result
                p = ctx.p
                # Nothing to group when the QP has no kids beyond CD CC CD.
                if p.count() <= 3:
                    continue
                # rest is bound but not used afterwards.
                cd_cc_cd, rest = p.kids[0:3], p.kids[3:]
                del p.kids[0:3]
                new_node = Node('QP', cd_cc_cd)
                p.kids.insert(0, new_node)
    return root