def reshape_for_coordination(node, inside_np_internal_structure):
    """Binarise a coordination node into a right-branching chain.

    Nodes with fewer than three kids are handed to ``label_adjunction``
    (with ``do_labelling=False``).  Otherwise kids are drawn from
    ``node.kids`` through ``get_kid`` and stacked into nested binary
    ``Node`` objects whose head is the right child.

    The rule for punctuation, as stated by the original author:
        (XP PU) (CC XP)
    if we get contiguous PU CC, associate the PU with the previous conjunct,
    but:
        XP (PU XP) (CC XP)
        XP (PU XP PU) (CC XP)
    i.e. attach PU to the right _unless_ it is followed by CC.

    NOTE(review): ``get_kid`` is defined elsewhere; it presumably removes
    the kids it returns from the list (the loop below runs until the list
    is empty), which means ``node.kids`` is consumed in place -- confirm.
    """
    if node.count() < 3:
        return label_adjunction(
            node,
            inside_np_internal_structure=inside_np_internal_structure,
            do_labelling=False)

    inner_tag = base_tag(node.tag, strip_cptb_tag=False)
    remaining = node.kids
    cc_seen = False

    # Seed the chain with the two rightmost units.
    rightmost, cc_seen = get_kid(remaining, cc_seen)
    next_rightmost, cc_seen = get_kid(remaining, cc_seen)
    chain = Node(inner_tag, [next_rightmost, rightmost], head_index=1)

    # Fold every remaining unit in on the left of the chain.
    while remaining:
        unit, cc_seen = get_kid(remaining, cc_seen)
        chain = Node(inner_tag, [unit, chain], head_index=1)

    # The topmost node keeps the original (unstripped) tag.
    chain.tag = node.tag
    return chain
def clusterfix(self, top, pp, p, s, t):
    """Re-analyse argument cluster coordination rooted at *top*.

    Three steps, in order:
      1. shrink the verb node *t* via ``self.fix_object_gap(pp, p, t, s)``;
      2. move all of *top*'s kids under a fresh ``'TAG'`` node and make
         *top* dominate ``[t, <that node>]``;
      3. for every ``VP(NP, QP|VP|VV)`` match under *top*, wrap the NP and
         the QP in unary nodes carrying the argument-cluster categories,
         then re-run ``self.fix_categories_starting_from``.
    """
    debug("Fixing argument cluster coordination: %s", pprint(top))
    debug('T: %s', t)

    # 1. Shrink the verb (node T)
    self.fix_object_gap(pp, p, t, s)

    # 2. Reattach the verb above the TOP node
    cluster = Node('TAG', top.kids, top.category, head_index=0)
    top.kids = [t, cluster]
    # (Reattaching parent pointers)
    for child in cluster:
        child.parent = cluster

    # 3. Find and relabel argument clusters
    cluster_pattern = r'/VP/=VP <1 /NP/=NP <2 /(QP|V[PV])/=QP'
    for _match, ctx in find_all(top, cluster_pattern, with_context=True):
        vp_node, np_node, qp_node = ctx.vp, ctx.np, ctx.qp

        # Now, VP should have category ((S[dcl]\NP)/QP)/NP
        verb_result = t.category.left.left
        qp_cat, np_cat = qp_node.category, np_node.category

        # NP should have category ((S[dcl]\NP)/QP)\(((S[dcl]\NP)/QP)/NP)
        raised_np_cat = (verb_result / qp_cat) | ((verb_result / qp_cat) / np_cat)
        # QP should have category ((S[dcl]\NP)\((S[dcl]\NP)/QP))
        raised_qp_cat = (verb_result) | ((verb_result) / qp_cat)

        # insert unary nodes
        np_wrapper = Node(np_node.tag, [np_node], raised_np_cat, head_index=0)
        np_node.parent = np_wrapper
        qp_wrapper = Node(qp_node.tag, [qp_node], raised_qp_cat, head_index=0)
        qp_node.parent = qp_wrapper

        replace_kid(vp_node, np_node, np_wrapper)
        replace_kid(vp_node, qp_node, qp_wrapper)

        self.fix_categories_starting_from(np_wrapper, top)
def label_head_initial(node, inherit_tag=False):
    """Binarise a head-initial node into a left-branching chain.

    Each kid is first passed through ``label_node``.  The first two labelled
    kids form the innermost binary node; every further kid is attached on
    the right of the growing chain.  All nodes are built with
    ``head_index=0`` (head on the left).

    Args:
        node: the node whose kids are to be binarised.
        inherit_tag: passed (negated) to ``strip_tag_if`` to decide how the
            tag for the intermediate nodes is derived from ``node.tag``.
            Forced to False when ``has_tag(node, 'c')`` holds.

    Returns:
        The topmost node of the chain, with its tag reset to ``node.tag``.

    NOTE(review): the node must have at least two kids; otherwise the
    second ``kids.pop`` fails -- unchanged from the original behaviour.
    """
    if has_tag(node, 'c'):
        inherit_tag = False
    kid_tag = strip_tag_if(not inherit_tag, node.tag)

    # Build a real list: the original sliced the result of map(), which is
    # only a list on Python 2 (on Python 3 a map object cannot be sliced,
    # popped, or used as a loop condition).
    kids = [label_node(kid) for kid in node.kids]
    # Reverse so that pop() hands back the kids in left-to-right order.
    kids.reverse()

    first_kid, second_kid = twice(kids.pop)()
    cur = Node(kid_tag, [first_kid, second_kid], head_index=0)

    while kids:
        kid = kids.pop()
        cur = Node(kid_tag, [cur, kid], head_index=0)

    # restore the full tag at the topmost level
    cur.tag = node.tag
    return cur
def label_predication(node, inherit_tag=False):
    """Binarise a predication node into a right-branching chain.

    Each kid is first passed through ``label_node``.  Kids are then drawn
    with ``get_kid_`` and nested so that every binary node has its head on
    the right (``head_index=1``).

    Args:
        node: the node whose kids are to be binarised.
        inherit_tag: passed (negated) to ``strip_tag_if`` to decide how the
            tag for the intermediate nodes is derived from ``node.tag``.

    Returns:
        The topmost node of the chain, with its tag reset to ``node.tag``.

    NOTE(review): ``get_kid_`` is defined elsewhere; it presumably removes
    and returns kids from the end of the list -- confirm.
    """
    kid_tag = strip_tag_if(not inherit_tag, node.tag)

    # Build a real list: get_kid_ consumes it and ``while kids`` tests its
    # emptiness, neither of which works on the lazy map object that
    # Python 3's map() returns (it is always truthy).
    kids = [label_node(kid) for kid in node.kids]

    last_kid, second_last_kid = twice(get_kid_)(kids)
    cur = Node(kid_tag, [second_last_kid, last_kid], head_index=1)

    while kids:
        kid = get_kid_(kids)
        cur = Node(kid_tag, [kid, cur], head_index=1)

    cur.tag = node.tag  # restore the full tag at the topmost level
    return cur
def fix_ip_app(self, p, a, s):
    """Wrap the IP-APP node *a* (a kid of *p*) in a unary ``NN`` node.

    The wrapper's category is ``s.category / s.category``.  The wrapped
    child is a copy of *a* whose tag has been reduced with ``base_tag``.
    """
    debug("Fixing IP-APP NX: %s", lrp_repr(p))

    relabelled = copy(a)
    # relabel to stop infinite matching
    relabelled.tag = base_tag(relabelled.tag)

    wrapper = Node("NN", [relabelled], s.category / s.category, head_index=0)
    replace_kid(p, a, wrapper)
def fix_nongap_extraction(self, _, n, pred, k):
    """Repair a non-gap extraction construction rooted at *n*.

    Removes the null element from the node, shrinks every phrase that
    dominates a ``*T*`` trace carrying the index taken from ``k.tag``, and
    then either relabels the relativiser (``self.relabel_relativiser``) or,
    failing that, inserts a unary ``NN`` node with a null-relativiser
    category above the first ``[ICV]P`` that has a sibling.

    Args:
        _: unused.
        n: root of the construction.
        pred: node handed to ``self.relabel_relativiser``.
        k: node whose tag supplies the trace index.
    """
    node = n
    debug("Fixing nongap extraction: %s", pprint(node))
    debug("k %s", pprint(k))
    self.remove_null_element(node)

    index = get_trace_index_from_tag(k.tag)
    expr = (
        r'*=PP < { *=P < { /[NPQ]P(?:-%(tags)s)?%(index)s/=T << ^/\*T\*/ $ *=S } }'
        % {
            'tags': ModifierTagsRegex,
            'index': index
        })

    # we use "<<" in the expression, because fix_*_topicalisation comes
    # before fix_nongap_extraction, and this can introduce an extra layer between
    # the phrasal tag and the trace
    for _trace, ctx in find_all(node, expr, with_context=True):
        pp, p, t, s = ctx.pp, ctx.p, ctx.t, ctx.s

        # remove T from P
        # replace P with S
        self.fix_object_gap(pp, p, t, s)

        if not self.relabel_relativiser(pred):
            result = get_first(node, r'/[ICV]P/=TOP $ *=SS', with_context=True)
            # get_first yields a falsy value when nothing matches; the
            # original unpacked it unconditionally and died with a
            # TypeError.  Mirror fix_subject_extraction: log and stop.
            if not result:
                debug(
                    'Could not find verbal category; did not create null relativiser.'
                )
                return

            top, context = result
            ss = context.ss
            debug("Creating null relativiser unary category: %s",
                  ss.category / ss.category)
            replace_kid(
                top.parent, top,
                Node("NN", [top], ss.category / ss.category, head_index=0))
def label_apposition(node, inherit_tag=False, inside_np_internal_structure=False):
    """Binarise an apposition node into a right-branching chain.

    Each kid is first passed through ``label_node`` (forwarding
    ``inside_np_internal_structure``).  Kids are then drawn with
    ``get_kid_`` and nested with the head on the right (``head_index=1``).
    If only one unit is available, that unit itself is returned (after its
    tag is overwritten with ``node.tag``).

    Args:
        node: the node whose kids are to be binarised.
        inherit_tag: passed (negated) to ``strip_tag_if`` to decide how the
            tag for the intermediate nodes is derived from ``node.tag``.
        inside_np_internal_structure: forwarded to ``label_node``.

    Returns:
        The topmost node of the chain, with its tag reset to ``node.tag``.
    """
    kid_tag = strip_tag_if(not inherit_tag, node.tag)

    # A list comprehension instead of map(lambda node: ...): the lambda
    # shadowed the ``node`` parameter, and the result must be a real list
    # because it is consumed by get_kid_ and tested for emptiness (a
    # Python 3 map object supports neither).
    kids = [
        label_node(kid, inside_np_internal_structure=inside_np_internal_structure)
        for kid in node.kids
    ]

    last_kid = get_kid_(kids)
    if kids:
        second_last_kid = get_kid_(kids)
        cur = Node(kid_tag, [second_last_kid, last_kid], head_index=1)
    else:
        # Only one unit: nothing to binarise.
        cur = last_kid

    while kids:
        kid = get_kid_(kids)
        cur = Node(kid_tag, [kid, cur], head_index=1)

    cur.tag = node.tag
    return cur
def fix_topicalisation_without_gap(self, node, p, s, t):
    """Wrap the gapless topic *t* (a kid of *p*) in a unary node.

    The wrapper keeps ``t.tag`` and receives the category
    ``featureless(p.category) / featureless(s.category)``; the wrapped
    child is a copy of *t* whose tag has been passed through ``base_tag``
    with ``strip_cptb_tag=False``.
    """
    debug("Fixing topicalisation without gap: %s", pprint(node))

    inner = copy(t)
    inner.tag = base_tag(inner.tag, strip_cptb_tag=False)

    topic_category = featureless(p.category) / featureless(s.category)
    wrapper = Node(t.tag, [inner], topic_category, head_index=0)
    replace_kid(p, t, wrapper)
def fix_subject_extraction(self, _, n, pred, w=None, reduced=False):
    """Repair a subject-extraction relative clause rooted at *n*.

    Args:
        _: unused.
        n: root of the construction.
        pred: node inspected for the bare-N rewrite and handed to
            ``self.relabel_relativiser``.
        w: optional node whose tag supplies the trace index; when falsy no
            index is appended to the trace pattern.
        reduced: when true, the null element is not removed from the node.

    NOTE(review): the source was collapsed onto one line; the relativiser
    handling is reconstructed as being inside the per-trace loop -- confirm
    against the upstream file.
    """
    debug("%s", reduced)
    node = n
    debug("Fixing subject extraction: %s", lrp_repr(node))

    # We only want this if we are using the N -> NP unary rule
    # This 'fix' lets us rewrite NP(WHNP CP) as NP(CP) with categories NP(N)
    if use_bare_N and pred.tag.startswith('NP'):
        # Fix for the NP(VP de) case:
        # ---------------------------
        #         NP                 NP
        #        /  \                |
        #     WHNP  CP     -->       CP
        #          /  \             /  \
        #        IP   DEC          IP   DEC
        if not pred.is_leaf():
            pred.kids.pop(0)
            pred.head_index = 0
    elif not reduced:
        self.remove_null_element(node)

    index = get_trace_index_from_tag(w.tag) if w else ''
    expr = r'*=PP < { *=P < { /NP-SBJ/=T << ^/\*T\*%s/ $ *=S } }' % index

    for _trace, ctx in find_all(node, expr, with_context=True):
        gap_grandparent, gap_parent, gap, gap_sibling = ctx.pp, ctx.p, ctx.t, ctx.s

        self.fix_object_gap(gap_grandparent, gap_parent, gap, gap_sibling)
        self.fix_categories_starting_from(gap_sibling, until=node)

        if self.relabel_relativiser(pred):
            continue

        # TOP is the shrunk VP
        # after shrinking, we can get VV or VA here
        # left_to_right so that we find the right node (used to match against the CP 已建成的 in 4:45(7))
        found = get_first(
            node,
            r'{ /([ICV]P|V[VA]|VRD|VSB|VCD)/=TOP $ *=SS } ! > /([ICV]P|V[VA]|VRD|VSB|VCD)/',
            with_context=True,
            left_to_right=True)
        if not found:
            debug(
                'Could not find verbal category; did not create null relativiser.'
            )
            return

        top, context = found
        SS = context.ss.category
        debug("Creating null relativiser unary category: %s", SS / SS)
        replace_kid(top.parent, top, Node("NN", [top], SS / SS, head_index=0))
def fix_object_extraction(self, _, n, pred, w=None, reduced=False):
    """Repair an object-extraction relative clause rooted at *n*.

    Args:
        _: unused.
        n: root of the construction.
        pred: node inspected for the bare-N rewrite and handed to
            ``self.relabel_relativiser``.
        w: optional node whose tag supplies the trace index; when falsy no
            index is appended to the trace pattern.
        reduced: when true, the null element is not removed from the node.

    NOTE(review): the source was collapsed onto one line; the relativiser
    handling is reconstructed as being inside the per-trace loop (it uses
    the loop's ``top`` binding) -- confirm against the upstream file.
    """
    node = n
    debug("Fixing object extraction: %s", lrp_repr(node))

    # We only want this if we are using the N -> NP unary rule
    # This 'fix' lets us rewrite NP(WHNP CP) as NP(CP) with categories NP(N)
    if use_bare_N and pred.tag.startswith('NP'):
        # Fix for the NP(VP de) case:
        # ---------------------------
        #         NP                 NP
        #        /  \                |
        #     WHNP  CP     -->       CP
        #          /  \             /  \
        #        IP   DEC          IP   DEC
        if not pred.is_leaf():
            pred.kids.pop(0)
            pred.head_index = 0
    elif not reduced:
        self.remove_null_element(node)

    index = get_trace_index_from_tag(w.tag) if w else ''
    expr = r'/[IC]P/=TOP << { *=PP < { *=P < { /NP-(OBJ|EXT)/=T << ^/\*T\*%s/ $ *=S } } }' % index

    for _trace, ctx in find_all(node, expr, with_context=True):
        top, gap_grandparent, gap_parent, gap, gap_sibling = (
            ctx.top, ctx.pp, ctx.p, ctx.t, ctx.s)

        self.fix_object_gap(gap_grandparent, gap_parent, gap, gap_sibling)
        self.fix_categories_starting_from(gap_sibling, until=top)

        # If we couldn't find the DEC node, this is the null relativiser case
        if self.relabel_relativiser(pred):
            continue

        # TOP is the S node
        # null relativiser category comes from sibling of TOP
        # if TOP has no sibling, then we're likely inside a NP-PRD < CP reduced relative (cf 1:2(9))
        found = get_first(top, r'* $ *=SS', with_context=True, nonrecursive=True)
        if not found:
            continue

        _, sibling_ctx = found
        ss = sibling_ctx.ss
        debug("Creating null relativiser unary category: %s",
              ss.category / ss.category)
        replace_kid(
            top.parent, top,
            Node("NN", [top], ss.category / ss.category, head_index=0))
def fix_modification(self, node, p, s, t):
    """Wrap the modifier *t* (a kid of *p*) in a unary node.

    The wrapper keeps ``t.tag`` and receives the category
    ``featureless(p.category) / featureless(s.category)``.
    """
    debug("Fixing modification: %s", lrp_repr(node))
    sibling_cat, parent_cat = s.category, p.category

    # If you don't strip the tag :m from the newly created child,
    # the fix_modification pattern will match infinitely when tgrep visits it
    inner = copy(t)
    inner.tag = base_tag(inner.tag, strip_cptb_tag=False)

    modifier_cat = featureless(parent_cat) / featureless(sibling_cat)
    debug("Creating category %s", modifier_cat)

    wrapper = Node(t.tag, [inner], modifier_cat, head_index=0)
    replace_kid(p, t, wrapper)
def label_adjunction(node, inherit_tag=False, do_labelling=True, inside_np_internal_structure=False):
    """Binarise an adjunction node into a right-branching chain.

    Kids are drawn with ``get_kid_`` and nested with the head on the right
    (``head_index=1``).  If only one unit is available, that unit itself is
    returned (after its tag is overwritten with ``node.tag``).

    Args:
        node: the node whose kids are to be binarised.
        inherit_tag: passed (negated) to ``strip_tag_if`` to decide how the
            tag for the intermediate nodes is derived from ``node.tag``.
        do_labelling: when true, every kid is first passed through
            ``label_node``; when false, ``node.kids`` is used directly, so
            that very list is what ``get_kid_`` operates on.
        inside_np_internal_structure: forwarded to ``label_node``.

    Returns:
        The topmost node of the chain, with its tag reset to ``node.tag``.
    """
    kid_tag = strip_tag_if(not inherit_tag, node.tag)

    if do_labelling:
        # A list comprehension instead of map(lambda node: ...): the lambda
        # shadowed the ``node`` parameter, and the result must be a real
        # list because it is consumed by get_kid_ and tested for emptiness
        # (a Python 3 map object supports neither).
        kids = [
            label_node(kid, inside_np_internal_structure=inside_np_internal_structure)
            for kid in node.kids
        ]
    else:
        kids = node.kids

    last_kid = get_kid_(kids)
    if kids:
        second_last_kid = get_kid_(kids)
        cur = Node(kid_tag, [second_last_kid, last_kid], head_index=1)
    else:
        # Only one unit: nothing to binarise.
        cur = last_kid

    while kids:
        kid = get_kid_(kids)
        cur = Node(kid_tag, [kid, cur], head_index=1)

    cur.tag = node.tag
    return cur
def accept_derivation(self, bundle):
    """Split matched leaves into two leaves under an ``NR`` node, then write.

    For every match of the module-level ``expr`` in ``bundle.derivation``
    whose first (decoded) character is in ``baixing``, the leaf is replaced
    by ``Node('NR', [first-character leaf, remainder leaf])``.  The
    derivation is written out once all matches have been processed.
    """
    for _match, ctx in find_all(bundle.derivation, expr, with_context=True):
        leaf = ctx.n
        text = leaf.lex.decode('u8')
        if text[0] not in baixing:
            continue

        head_leaf = Leaf(leaf.tag, text[0].encode('u8'), None)
        rest_leaf = Leaf(leaf.tag, text[1:].encode('u8'), None)
        replace_kid(leaf.parent, leaf, Node('NR', [head_leaf, rest_leaf]))

    self.write_derivation(bundle)
def fix_rnr(self, rnr, g):
    """Repair a right-node-raising (``*RNR*``) construction under *g*.

    Steps, in order:
      1. collect the trace index of every ``*RNR*`` leaf found under the
         conjuncts (nodes matching ``/:c/a``) of *g*;
      2. for each collected index, shrink the phrase containing the trace
         via ``fix_object_gap`` and re-fix categories up to *g*;
      3. in the last conjunct, excise the shared argument for each distinct
         index and remember it;
      4. re-attach the excised arguments above *g*, one new binary node per
         argument, and splice the result in place of *g*.

    NOTE(review): the ``rnr`` parameter is rebound by the inner loop below
    before it is ever read, so the value passed in is effectively unused.
    """
    # G is the node dominating all the conjuncts
    rnr_tags = []
    for node, ctx in find_all(g, r'/:c/a', with_context=True):
        for rnr in find_all(node, r'^/\*RNR\*/'):
            rnr_tags.append(get_trace_index_from_tag(rnr.lex))

    # Shrink every phrase that dominates an *RNR* trace with a known index.
    for index in rnr_tags:
        for node, ctx in find_all(
                g,
                r'*=PP < { *=P < { *=T < ^/\*RNR\*%s/ $ *=S } }' % index,
                with_context=True):
            inherit_tag(ctx.s, ctx.p)
            self.fix_object_gap(ctx.pp, ctx.p, ctx.t, ctx.s)
            self.fix_categories_starting_from(ctx.s, g)

    # This breaks with the IP (LC CC LC) case in 9:19(11) -- last_conjunct returns None
    # because the last conjunct has been shrunk
    # NOTE(review): if no conjunct is found, last_conjunct[0] below will
    # fail on the empty list -- this is the breakage described above.
    last_conjunct = list(find_first(g, r'/:c/a', left_to_right=False))

    args = []
    # Here, we uniquify the rnr tags so that we excise each shared argument only once
    for index in set(rnr_tags):
        # find_first, because we only want to find one match, the shallowest.
        # cf 7:27(10), if NP-OBJ-2(NN NP-OBJ-2(JJ NN)), then we only want to identify
        # one matching node for index -2 -- the shallowest -- and not two.
        for node, ctx in find_first(last_conjunct[0],
                                    r'*=P < { /%s/a=T $ *=S }' % index,
                                    with_context=True):
            args.append(ctx.t)
            # Note: last_conjunct may be disconnected from
            # the tree by replace_kid (when ctx.p == last_conjunct)
            replace_kid(ctx.p.parent, ctx.p, ctx.s)
            self.fix_categories_starting_from(ctx.s, g)

    # Because the find_all which retrieved the args is an in-order left-to-right traversal, it will find
    # shallower nodes before deeper nodes. Therefore, if a verb has two args V A1 A2, the _args_ list will
    # contain [A2, A1] because A2 is shallower (further from the head) than A1.
    # We reverse the list of args, so that args are re-attached from the inside out (starting from A1).
    # (The reversal itself is currently disabled:)
    # args.reverse()

    # Stack one new binary node per excised argument on top of g; each new
    # node takes the left part of the category of the node it dominates.
    new_g = g
    for arg in args:
        new_g = Node(new_g.tag, [new_g, arg],
                     new_g.category.left,
                     head_index=0)
        arg.parent = new_g

    replace_kid(g.parent, g, new_g)
def fix_whword_topicalisation(self, node, p, s, t):
    """Type-raise a topicalised wh-word *t* and collapse its gap sites.

    *t* (a kid of *p*) is wrapped in a unary node carrying a topicalisation
    type-raised category over ``SbNP``.  Then every VP under *p* that
    dominates the matching ``*T*`` trace inside an NP-SBJ/NP-OBJ is
    replaced by the trace phrase's sibling, and categories are re-fixed.
    """
    debug('Fixing wh-word topicalisation: node: %s', lrp_repr(node))

    # stop this method from matching again (in case there's absorption on the top node, cf 2:22(5))
    t.tag = base_tag(t.tag, strip_cptb_tag=False)

    # create topicalised category based on the tag of T
    topic_base_cat = ptb_to_cat(t)

    # insert a node with the topicalised category
    wrapper_tag = base_tag(t.tag, strip_cptb_tag=False)
    raised_cat = typeraise(topic_base_cat, SbNP, TR_TOPICALISATION)
    replace_kid(p, t, Node(wrapper_tag, [t], raised_cat, head_index=0))

    trace_index = get_trace_index_from_tag(t.tag)
    gap_pattern = (
        r'*=PP < { /VP/=P < { /NP-(?:SBJ|OBJ)/=T < ^/\*T\*%s/ $ *=S } }'
        % trace_index)

    for top, ctx in find_all(p, gap_pattern, with_context=True):
        replace_kid(ctx.pp, ctx.p, ctx.s)
        self.fix_categories_starting_from(ctx.s, until=top)
def fix_topicalisation_with_gap(self, node, p, s, t):
    """Type-raise a topicalised phrase *t* and shrink its gap sites in *s*.

    *t* (a kid of *p*) is wrapped in a unary node carrying a topicalisation
    type-raised category over the module-level ``S``.  Then, for every IP
    under *s* that dominates the matching ``*T*`` trace, the trace phrase
    is removed via ``fix_object_gap`` and categories are re-fixed up to
    that IP.
    """
    debug("Fixing topicalisation with gap:\nnode=%s\ns=%s\nt=%s",
          lrp_repr(node), pprint(s), pprint(t))

    # stop this method from matching again (in case there's absorption on the top node, cf 2:22(5))
    t.tag = base_tag(t.tag, strip_cptb_tag=False)

    # create topicalised category based on the tag of T
    topic_base_cat = ptb_to_cat(t)

    # insert a node with the topicalised category
    wrapper_tag = base_tag(t.tag, strip_cptb_tag=False)
    raised_cat = typeraise(topic_base_cat, S, TR_TOPICALISATION)
    replace_kid(p, t, Node(wrapper_tag, [t], raised_cat, head_index=0))

    trace_index = get_trace_index_from_tag(t.tag)

    # attested gaps:
    # 575 IP-TPC:t
    # 134 NP-TPC:t
    #  10 IP-Q-TPC:t
    #   8 CP-TPC:t
    #   4 NP-PN-TPC:t
    #   2 QP-TPC:t
    #   2 NP-TTL-TPC:t
    #   1 PP-TPC:t
    #   1 IP-IJ-TPC:t
    #   1 INTJ-TPC:t
    #   1 CP-Q-TPC:t
    #   1 CP-CND-TPC:t
    gap_pattern = (
        r'/IP/=TOP << { *=PP < { *=P < { /[NICQP]P-(?:SBJ|OBJ)/=T < ^/\*T\*%s/ $ *=S } } }'
        % trace_index)

    for top, ctx in find_all(s, gap_pattern, with_context=True):
        debug('top: %s', pprint(top))
        self.fix_object_gap(ctx.pp, ctx.p, ctx.t, ctx.s)
        self.fix_categories_starting_from(ctx.s, until=top)
def fix_categories_starting_from(self, node, until):
    '''Adjusts category labels from _node_ to _until_ (not inclusive) to obtain the correct
    CCG analysis.

    Walks upwards from _node_ through ``.parent`` links.  The walk stops when
    _until_ is reached, when a node has no parent, or when the parent has
    fewer than two kids.  At every step the parent's first two kids (L, R)
    and the parent (P) are checked with ``analyse``; when no rule licenses
    ``L R -> P`` the method tries a series of repairs (type-raising one
    side, coordination/absorption relabelling, and finally plain
    composition) and, where one succeeds, rewrites the affected categories
    and/or inserts a unary type-raising node in place.

    NOTE(review): the source was collapsed onto a few long lines; the
    nesting below (in particular, that the first ``if``/``elif`` pair and
    the following ``if str(L) ...`` chain are siblings) is reconstructed
    and should be confirmed against the upstream file.
    '''
    while node is not until:
        # Only fix binary rules
        if (not node.parent) or node.parent.count() < 2:
            break

        l, r, p = node.parent[0], node.parent[1], node.parent
        L, R, P = (n.category for n in (l, r, p))
        debug("L: %s R: %s P: %s", L, R, P)

        applied_rule = analyse(L, R, P)
        debug("[ %s'%s' %s'%s' -> %s'%s' ] %s", L, ''.join(l.text()), R,
              ''.join(r.text()), P, ''.join(p.text()), applied_rule)

        if applied_rule is None:
            debug("invalid rule %s %s -> %s", L, R, P)

            if R.is_complex() and R.left.is_complex() and L == R.left.right:
                # L       (X|L)|Y -> X|Y becomes
                # X|(X|L) (X|L)|Y -> X|Y
                T = R.left.left
                new_category = typeraise(L, T, TR_FORWARD)  #T/(T|L)
                # Wrap the left kid in a unary node carrying the raised category.
                node.parent[0] = Node(l.tag, [l], new_category, head_index=0)

                new_parent_category = fcomp(new_category, R)
                if new_parent_category:
                    debug("new parent category: %s", new_parent_category)
                    p.category = new_parent_category

                debug("New category: %s", new_category)

            elif L.is_complex() and L.left.is_complex() and R == L.left.right:
                # (X|R)|Y R       -> X|Y becomes
                # (X|R)|Y X|(X|R) -> X|Y
                T = L.left.left
                new_category = typeraise(R, T, TR_BACKWARD)  #T|(T/R)
                # Wrap the right kid in a unary node carrying the raised category.
                node.parent[1] = Node(r.tag, [r], new_category, head_index=0)

                new_parent_category = bxcomp(L, new_category)
                if new_parent_category:
                    debug("new parent category: %s", new_parent_category)
                    p.category = new_parent_category

                debug("New category: %s", new_category)

            # conj R -> P
            # Make P into R[conj]
            # L cannot be the comma category (,), otherwise we get a mis-analysis
            # in 2:22(5)
            if str(L) in ('conj', 'LCM'):
                p.category = R.clone_adding_feature('conj')
                debug("New category: %s", p.category)

            # L R[conj] -> P
            elif R.has_feature('conj'):
                new_L = L.clone()

                r.category = new_L.clone_adding_feature('conj')
                p.category = new_L

                debug("New category: %s", new_L)

            elif L.is_leaf():
                # , R -> P[conj] becomes , R -> R[conj]
                if P.has_feature('conj') and l.tag in (
                        'PU', 'CC'):  # treat as partial coordination
                    debug("Fixing coordination: %s" % P)
                    p.category = r.category.clone_adding_feature('conj')
                    debug("new parent category: %s" % p.category)

                # , R -> P becomes , R -> R
                elif l.tag == "PU" and not P.has_feature(
                        'conj'):  # treat as absorption
                    debug("Fixing left absorption: %s" % P)
                    p.category = r.category

                # L       (X|L)|Y -> X|Y becomes
                # X|(X|L) (X|L)|Y -> X|Y
                elif R.is_complex() and R.left.is_complex() and L == R.left.right:
                    T = R.left.left
                    new_category = typeraise(L, T, TR_FORWARD)  #T/(T|L)
                    node.parent[0] = Node(l.tag, [l], new_category, head_index=0)

                    new_parent_category = fcomp(new_category, R)
                    if new_parent_category:
                        debug("new parent category: %s", new_parent_category)
                        p.category = new_parent_category

                    debug("New category: %s", new_category)

            elif R.is_leaf():
                # R , -> P becomes R , -> R
                if r.tag == "PU":  # treat as absorption
                    debug("Fixing right absorption: %s" % P)
                    p.category = l.category

                # (X|R)|Y R       -> X|Y becomes
                # (X|R)|Y X|(X|R) -> X|Y
                elif L.is_complex() and L.left.is_complex() and R == L.left.right:
                    T = L.left.left
                    new_category = typeraise(R, T, TR_BACKWARD)  #T|(T/R)
                    node.parent[1] = Node(r.tag, [r], new_category, head_index=0)

                    new_parent_category = bxcomp(L, new_category)
                    if new_parent_category:
                        debug("new parent category: %s", new_parent_category)
                        p.category = new_parent_category

                    debug("New category: %s", new_category)

            else:
                # Neither side is a conjunction marker or a leaf: try the
                # remaining repairs, tracking success in new_parent_category.
                new_parent_category = None

                # try typeraising fix
                # T/(T/X) (T\A)/X -> T can be fixed:
                # (T\A)/((T\A)/X) (T\A)/X -> T\A
                if self.is_topicalisation(L) and (L.right.right == R.right
                                                  and P == L.left
                                                  and P == R.left.left):
                    T_A = R.left
                    X = R.right

                    l.category = T_A / (T_A / X)
                    new_parent_category = T_A

                # (X|X)|Z Y       -> X becomes
                # (X|X)|Z X|(X|X) -> X|Z
                elif L.is_complex() and L.left.is_complex() and R == L.left.right:
                    T = L.left.left
                    new_category = typeraise(
                        R, R, TR_BACKWARD, strip_features=False)  #T/(T|L)
                    node.parent[1] = Node(r.tag, [r], new_category, head_index=0)

                    new_parent_category = bxcomp(L, new_category)
                    if new_parent_category:
                        debug("new parent category: %s", new_parent_category)
                        p.category = new_parent_category

                    debug("New category: %s", new_category)

                # Generalise over right modifiers of verbal categories (S[dcl]\X)$
                elif self.is_verbal_category(
                        L) and L.is_complex() and L.left.is_complex():
                    T = L.left.right
                    new_category = typeraise(R, T, TR_BACKWARD)
                    debug('Trying out %s', new_category)

                    # Only commit the unary wrapper if composition succeeds.
                    if bxcomp(L, new_category):
                        node.parent[1] = Node(r.tag, [r], new_category, head_index=0)
                        new_parent_category = bxcomp(L, new_category)

                # Last ditch: try all of the composition rules to generalise over L R -> P
                if not new_parent_category:
                    # having fxcomp creates bad categories in NP(IP DEC) construction (1:97(3))
                    # but, we need fxcomp to create the gap NP-TPC NP-SBJ(*T*) VP, so allow it when the rhs doesn't look like the DEC category
                    new_parent_category = (
                        fcomp(L, R)
                        or bcomp(L, R, when=not self.is_relativiser(R))
                        or bxcomp(
                            L, R, when=not self.is_relativiser(R)
                        )  #or bxcomp2(L, R, when=self.is_verbal_category(L))
                        or fxcomp(L, R, when=not self.is_relativiser(R)))

                if new_parent_category:
                    debug("new parent category: %s", new_parent_category)
                    p.category = new_parent_category
                else:
                    debug("couldn't fix, skipping")

        # Move one level up the tree.
        node = node.parent
        debug('')
def preprocess(root):
    """Normalise a treebank tree in place before further conversion.

    Visits every internal node yielded by ``nodes(root)`` and applies, per
    node: the optional LCP -> NP relabelling, the paired-quotation
    reshaping, and then at most one of a chain of corpus-specific
    structural/tagging repairs (see the comments on each branch; the
    ``N:M(K)`` references are the original author's pointers to corpus
    examples).

    Args:
        root: root node of the tree; modified in place.

    Returns:
        The same ``root`` object.

    NOTE(review): the source was collapsed onto a few long lines; the
    nesting of the branches is reconstructed from the conditions they test
    and should be confirmed against the upstream file.
    """
    # IP < PP PU -> PP < PP PU (20:58(1))
    if root.count() == 2 and root[1].tag == 'PU' and root[0].tag.startswith('PP'):
        root.tag = root[0].tag

    for node in nodes(root):
        if node.is_leaf():
            continue

        if rewrite_lcp_as_np and node.tag.startswith('LCP'):
            node.tag = node.tag.replace('LCP', 'NP')

        # NOTE(review): these indices are computed before the quotation
        # reshaping below, which may change node.kids -- confirm they
        # cannot go stale for the branches that use last_kid_index.
        first_kid, first_kid_index = get_nonpunct_kid(node, get_last=False)
        last_kid, last_kid_index = get_nonpunct_kid(node, get_last=True)

        # ---------------------
        # Where LPU, RPU are paired punctuation, reshape YP(LPU ... XP RPU YP) into YP(XP(LPU ... XP) YP)
        if any(kid.lex in ("“", "「") for kid in leaf_kids(node)) and any(kid.lex in ("”", "」") for kid in leaf_kids(node)):
            lqu = first_index_such_that(lambda kid: kid.is_leaf() and kid.lex in ("“", "「"), node)
            rqu = first_index_such_that(lambda kid: kid.is_leaf() and kid.lex in ("”", "」"), node)
            if rqu != node.count()-1:
                quoted_kids = node.kids[lqu:rqu+1]
                del node.kids[lqu:rqu+1]

                last_nonpunct_kid, _ = get_nonpunct_element(quoted_kids, get_last=True)
                # Bad punctuation in 27:84(4) causes a mis-analysis, just ignore
                # NOTE(review): when this is falsy the removed span is not
                # re-inserted -- harmless if the span was empty, otherwise
                # those kids are dropped; confirm this is intended.
                if last_nonpunct_kid:
                    quoted_node = Node(last_nonpunct_kid.tag, quoted_kids)
                    node.kids.insert(lqu, quoted_node)

        # CPTB/Chinese-specific fixes
        # ---------------------------
        # PP(P CP NP) in derivations like 5:11(3) should be PP(P NP(CP NP))
        if first_kid and first_kid.tag == "P" and node.count() > 2:
            last_tag = last_kid.tag
            rest = node.kids[1:]
            del node.kids[1:]
            node.kids.append(Node(last_tag, rest, node))

        # 2:12(3). DNP-PRD fixed by adding a layer of NP
        elif (node.tag.startswith('VP') and node.count() == 2 and
              node[0].tag.startswith('VC') and
              node[1].tag.startswith('DNP-PRD')):
            node[1] = Node('NP', [node[1]], node)

        # fix missing -OBJ tag from VP object complements (c.f. 31:18(4))
        # (the original tested node.tag.startswith('VP') twice; once suffices)
        elif (node.tag.startswith('VP') and node.count() >= 2 and
              node[0].tag == 'VV' and
              node[-1].tag == 'NP'):
            node[-1].tag += "-OBJ"

        # fix bad annotation IP < IP (2:7(28)), VP < VP (0:1(5))
        elif any(is_repeated_unary_projection(xp, node) for xp in ('IP', 'VP', 'NP', 'CP')):
            node.kids = node[0].kids

        # treat DP-SBJ as QP-SBJ (6:37(9)): the rationale is that the determiner (e.g. 每) acts as a specifier,
        # just like a quantity
        elif node.tag == 'DP-SBJ':
            node.tag = 'QP-SBJ'

        # attach the PU preceding a PRN under the PRN
        elif last_kid and last_kid.tag == 'PRN' and last_kid.count() == 1:
            # Only look left when there IS a kid to the left: with index 0,
            # ``last_kid_index-1`` is -1 and would wrap around to the LAST
            # kid, pulling a trailing PU into the PRN instead of a
            # preceding one.
            if last_kid_index > 0:
                maybe_pu = node[last_kid_index-1]
                if maybe_pu.tag == 'PU':
                    del node.kids[last_kid_index-1]
                    last_kid.kids.insert(0, maybe_pu)  # prepend

        # DEG instead of DEC (29:34(3)). if there's a trace in DEG's sibling and no DEC, then change DEG to DEC.
        elif node.tag == 'CP' and node.count() == 2 and node[0].tag == 'IP' and node[1].tag == 'DEG':
            if get_first(node[0], r'^/\*T\*/') and not get_first(node[0], r'/DEC/'):
                node[1].tag = 'DEC'

        elif node.tag.startswith('NP') and any(kid.tag.startswith('QP-APP') for kid in node):
            for kid in node:
                if kid.tag.startswith('QP-APP'):
                    kid.tag = kid.tag.replace('QP', 'NP')

        # NP(CP NP-APP NP-PN) -> NP(CP NP(NP-APP NP-PN)) so that NP(NP-APP NP-PN) can receive NP internal structure-type analysis
        elif node.tag.startswith('NP') and node.count() == 3 and node[0].tag.startswith('CP') and node[1].tag.startswith('NP-APP') and node[2].tag.startswith('NP-PN'):
            np_app, np_pn = node[1], node[2]
            del node.kids[1:]
            node.kids.append(Node(node.tag, [np_app, np_pn], node))

        # IP < NP-SBJ ADVP VP rather than IP < NP-SBJ VP(ADVP VP) (25:59(12), 6:92(19))
        elif node.tag == 'IP' and node.count() == 3 and node[0].tag == 'NP-SBJ' and node[1].tag == 'ADVP' and node[2].tag == 'VP':
            advp = node.kids.pop(1)
            # VP is the new node[1]
            # now replace node[1] with Node(node[1])
            node[1] = Node(node[1].tag, [advp, node[1]], node)

        # fixing DNP(PN DEG), which causes mis-tagging DNP(PN:l DEG:h)
        # only 3 cases: 23:61(5), 9:14(14), 21:3(11)
        elif node.tag == 'DNP' and node.count() == 2 and node[0].tag == 'PN' and node[1].tag == 'DEG':
            replace_kid(node, node[0], Node('NP', [node[0]]))

        elif is_vnv(node) and node.count() == 3:
            # Re-analyse VNV as coordination
            node[1].tag = 'CC'

        # fix mistaggings of the form ADVP < JJ (1:7(9)), NP < JJ (5:35(1))
        elif node.count() == 1:
            # fix IP < VP by adding *pro*
            if node.tag.startswith('IP') and node[0].tag.startswith('VP'):
                leaf = Leaf('-NONE-', '*pro*', None)
                pro = Node('NP-SBJ', [leaf])
                node.kids.insert(0, pro)

            elif node[0].tag == 'JJ':
                if node.tag.startswith('ADVP'):
                    node.tag = node.tag.replace('ADVP', 'ADJP')
                elif node.tag.startswith('NP'):
                    node.tag = node.tag.replace('NP', 'ADJP')

            # fix NP < VV
            elif node.tag == 'NP' and node[0].tag == 'VV':
                node.tag = node.tag.replace('NP', 'VP')

            # fix NP < ADJP < JJ (5:35(1))
            elif node.tag == 'NP' and node[0].tag == 'ADJP':
                replace_kid(node.parent, node, node[0])

            # fix projections NP < QP
            elif node[0].tag.startswith('QP') and node.tag.startswith('NP'):
                inherit_tag(node[0], node)  # copy PCTB tags from NP to QP
                node.tag = node[0].tag  # copy QP to parent, replacing NP
                node.kids = node[0].kids

            elif node[0].tag == 'IP' and node.tag == 'CP-APP':
                inherit_tag(node[0], node)
                node.tag = node[0].tag
                node.kids = node[0].kids

            # CLP < NN
            elif node[0].tag == 'NN' and node.tag == 'CLP':
                node[0].tag = 'M'

            elif node[0].tag == 'NN' and node.tag.startswith("VP"):
                node[0].tag = 'VV'

            elif node[0].tag == 'CP':
                if node.tag == 'NP-PRD':
                    node.kids = node[0].kids
                else:
                    # Rewrite NP < { CP < { CP < DEC } }
                    # (i.e. 比 报告 的 早 一点) so that it's headed by the 的
                    expr = r'''/CP/ < { /CP/ < /DEC/ }'''
                    if get_first(node[0], expr):
                        node.kids = node[0].kids

            elif node[0].tag in ('NP', 'NP-PN', 'VP', 'IP') and node.tag == 'PRN':
                node.kids = node[0].kids

            # ADVP < CS: shrink so that CS will be considered the head by binarise
            # CP < M: tagging error 7:14(8), 10:51(4), 11:13(32), 11:15(47)
            elif ((node.tag == 'ADVP' and node[0].tag == 'CS') or
                  (node[0].tag == 'M' and node.tag == 'CP')):
                replace_kid(node.parent, node, node[0])

            # fix NP<DNP so that it's headed by the DEC 8:38(18), 0:30(4)
            elif node.tag.startswith('NP') and node[0].tag.startswith('DNP'):
                node.kids = node[0].kids

            # elif node.tag == 'VP' and node[0].tag == 'NP-PRD':
            #     replace_kid(node.parent, node, node[0])
            # couple of noisy derivs like 10:35(80), 10:26(121), 11:37(3)
            # elif node.tag == 'VP' and node[0].tag.startswith('IP'):
            #     replace_kid(node.parent, node, node[0])

        # Reshape LB (long bei)
        # ---------------------
        elif first_kid and first_kid.tag == "LB":
            expr = r'''* < { /LB/=LB [ $ { * < /-(SBJ|OBJ|PN)/a=SBJ < /(V[PV]|VRD|VSB)/=PRED }=IP | $ { /CP/=CP < { *=IP < /-(SBJ|OBJ|PN)/a=SBJ < /(V[PV]|VRD|VSB)/=PRED } } ] }'''
            result = get_first(node, expr, with_context=True)
            # get_first yields a falsy value when nothing matches; the
            # original unpacked it unconditionally (TypeError on no match).
            # Guard it the same way the BA branch below does.
            if result:
                top, ctx = result
                lb, sbj, pred = ctx.lb, ctx.sbj, ctx.pred
                top.kids = [lb, Node('IP', [sbj, pred])]
                # (alternative flat shape, not used: top.kids = [lb, sbj, pred])

        elif first_kid and first_kid.tag == "BA":
            expr = r'''* < { /BA/=LB $ { /IP/ < /NP/=SBJ < /VP/=PRED } }'''
            result = get_first(node, expr, with_context=True)
            if result:
                top, ctx = result
                lb, sbj, pred = ctx.lb, ctx.sbj, ctx.pred
                # (alternative nested shape, not used: top.kids = [lb, Node('IP', [sbj, pred])])
                top.kids = [lb, sbj, pred]

        # single mistagging CP-SBJ for CP in 24:58(1)
        elif node.tag == 'CP-SBJ':
            node.tag = 'CP'

        else:
            # None of the targeted repairs applied: run the pattern-based
            # repairs below, each independently of the others.

            # Fix missing phrasal layer in NP < NN DEG (21:10(4))
            result = get_first(node, r'/DNP/=P < { /N[NRT]/=N $ /DEG/ }', with_context=True)
            if result:
                p, ctx = result
                n = ctx.n
                replace_kid(p, n, Node('NP', [n]))

            # Fix missing phrasal layer in LCP < NN LC (11:17(9))
            result = get_first(node, r'/LCP/=P < { /N[NRT]/=N $ /LC/ }', with_context=True)
            if result:
                p, ctx = result
                n = ctx.n
                replace_kid(p, n, Node('NP', [n]))

            # Fix wrongly attached DEC (5:26(6))
            result = get_first(node, r'/CP/=TOP < { /IP/=P < { /NP/ $ /VP/ $ /DEC/=DEC } }', with_context=True)
            if result:
                _, ctx = result
                top, p, dec = ctx.top, ctx.p, ctx.dec
                top.kids.append(dec)
                p.kids.remove(dec)

            # IP-TPC(NP(*PRO*) VP): replace the IP-TPC by its VP, which
            # inherits the IP-TPC's tags.
            result = get_first(node, r'*=PP < { /IP-TPC/=P <1 { /NP/=T < ^/\*PRO\*/ } <2 /VP/=S }', nonrecursive=True, with_context=True)
            if result:
                _, ctx = result
                pp, p, s = ctx.pp, ctx.p, ctx.s
                inherit_tag(s, p)
                replace_kid(pp, p, s)

            # VP(VV IP-OBJ(NP-SBJ VP)): flatten the IP-OBJ into the VP,
            # dropping the subject when it is a *PRO*.
            expr = r'''/VP/=VP <1 /VV/=V <2 { /IP-OBJ/ <1 /NP-SBJ/=SBJ <2 /VP/=PRED }'''
            result = get_first(node, expr, with_context=True)
            if result:
                _, ctx = result
                vp, v, sbj, pred = ctx.vp, ctx.v, ctx.sbj, ctx.pred
                del vp.kids
                if get_first(sbj, r'* < ^/\*PRO\*/'):
                    vp.kids = [v, pred]
                else:
                    vp.kids = [v, sbj, pred]

            # QP(CD CC CD ...): group the leading CD CC CD under its own QP
            # when more material follows.
            expr = r'''/QP/=P <1 /CD/ <2 /CC/ <3 /CD/'''
            result = get_first(node, expr, with_context=True)
            if result:
                _, ctx = result
                p = ctx.p
                if p.count() <= 3:
                    continue
                cd_cc_cd = p.kids[0:3]
                del p.kids[0:3]
                new_node = Node('QP', cd_cc_cd)
                p.kids.insert(0, new_node)

    return root