def accept_derivation(self, bundle):
    """Flatten every NP-internal-structure node down to its leaves, then emit the derivation."""
    for candidate in nodes(bundle.derivation):
        if is_np_internal_structure(candidate):
            # replace the node's kids with the flat sequence of its leaves
            candidate.kids = list(leaves(candidate))
    self.write_derivation(bundle)
def accept_derivation(self, bundle):
    """Tally rule signatures over internal nodes; unary rules over a leaf get an extra tally."""
    for node in nodes(bundle.derivation):
        if node.is_leaf():
            continue
        # unary rule whose single kid is a lexical item
        if node.count() == 1 and node[0].is_leaf():
            self.unary[self.signature(node)] += 1
        self.freqs[self.signature(node)] += 1
def accept_derivation(self, bundle):
    """For every IP-APP node, count the head word of its parent's second kid."""
    for node in nodes(bundle.derivation):
        if not node.tag.startswith('IP-APP'):
            continue
        sibling = node.parent[1]
        self.headfreqs[self.get_head(sibling)] += 1
def accept_derivation(self, bundle):
    """Count signatures of all internal nodes, tracking unary-over-leaf rules separately."""
    internal = (n for n in nodes(bundle.derivation) if not n.is_leaf())
    for node in internal:
        if node.count() == 1 and node[0].is_leaf():
            # single kid which is a lexical item: a unary rule
            self.unary[self.signature(node)] += 1
        self.freqs[self.signature(node)] += 1
def accept_derivation(self, bundle):
    """Classify each internal node by construction type and tally its kid-tag sequence."""
    for node in nodes(bundle.derivation):
        if node.is_leaf():
            continue
        self.counts['total'] += 1
        tag_seq = ' '.join(kid.tag for kid in node)
        if is_predication(node):
            self.counts['predication'] += 1
            self.predication_kinds[tag_seq] += 1
        elif is_coordination(node):
            # coordination
            self.counts['coordination'] += 1
            self.coordination_kinds[tag_seq] += 1
        elif is_internal_structure(node):
            self.counts['structure'] += 1
        elif node[0].is_leaf():
            # head initial complementation
            self.counts['head-initial'] += 1
        elif node[-1].is_leaf():
            # head final complementation
            self.counts['head-final'] += 1
        elif is_apposition(node):
            self.counts['apposition'] += 1
            self.apposition_kinds[tag_seq] += 1
        elif is_modification(node):
            self.counts['modification'] += 1
            self.modification_kinds[tag_seq] += 1
        else:
            # everything else is treated as adjunction
            self.counts['adjunction'] += 1
            self.adjunction_kinds[tag_seq] += 1
def accept_derivation(self, bundle):
    """Count occurrences of each (left kid, right kid, parent) category triple."""
    for node in nodes(bundle.derivation):
        if node.is_leaf():
            continue
        parent_cat = node.cat
        left_cat = node[0].cat
        # unary rules have no right kid
        right_cat = node[1].cat if node.count() > 1 else None
        self.counts[(left_cat, right_cat, parent_cat)] += 1
def accept(self, root):
    """Reject (return False) any derivation containing a category with a bad subcat index,
    recording the offending category's frequency."""
    for node in nodes(root):
        if has_bad_subcat(node.cat):
            self.bad_freqs[str(node.cat)] += 1
            return False
    return True
def accept_derivation(self, bundle):
    """Record which conjunction words (CC kids) occur under which coordination categories,
    maintaining both the forward and inverse frequency maps."""
    for node in nodes(bundle.derivation):
        if node.is_leaf() or not is_coordination(node):
            continue
        for cc in list(where(lambda kid: kid.tag == 'CC', node.kids)):
            self.conjs[base_tag(node.tag)][cc.lex] += 1
            self.inverse[cc.lex][base_tag(node.tag)] += 1
def accept_derivation(self, bundle):
    """Tally (left, right, parent) category triples for every internal node."""
    for node in nodes(bundle.derivation):
        if node.is_leaf():
            continue
        lhs = node[0].cat
        rhs = node[1].cat if node.count() > 1 else None  # None for unary rules
        self.counts[(lhs, rhs, node.cat)] += 1
def accept_derivation(self, bundle): for node in nodes(bundle.derivation): if node.is_leaf(): continue if is_coordination(node): def get_tag(kid): if kid.tag in ('CC', 'PU'): return kid.lex else: return kid.tag print ' '.join(get_tag(kid) for kid in node)
def accept_derivation(self, bundle):
    """Tally how often a rule is a left punctuation absorption, a right one, or neither."""
    for node in nodes(bundle.derivation):
        if node.is_leaf():
            continue
        self.total += 1
        # rch may be absent for unary rules, hence the short-circuit
        outcome = analyse(node.lch.cat, node.rch and node.rch.cat, node.cat)
        if outcome == 'l_punct_absorb':
            self.l += 1
        elif outcome == 'r_punct_absorb':
            self.r += 1
        else:
            self.other += 1
def accept_derivation(self, bundle):
    """Optionally merge verb compounds into single leaves and split interpunct-separated
    foreign names into NP-PN nodes, then emit the derivation if it passes self.accept."""
    global merge_verb_compounds
    if merge_verb_compounds:
        for node in nodes(bundle.derivation):
            if node.tag in self.MergedTags:
                # collapse the compound into one leaf carrying the concatenated lex
                merged_lex = ''.join(kid.lex for kid in leaves(node))
                replace_kid(node.parent, node,
                            Leaf(node.tag, merged_lex, node.parent))
    if normalise_foreign_names:
        for leaf in leaves(bundle.derivation):
            if self.is_candidate_foreign_name(leaf.lex):
                # split on the interpunct and re-parent the pieces under NP-PN
                pieces = [Leaf(leaf.tag, bit, None) for bit in leaf.lex.split(INTERPUNCT)]
                replace_kid(leaf.parent, leaf, Node('NP-PN', pieces))
    if self.accept(bundle.derivation):
        self.write_derivation(bundle)
def postprocess(root):
    """Apply post-labelling fixes to the derivation and return it.

    Currently: when the LCP->NP type-change is enabled, wrap LCP nodes that
    carry the 'l'/'r' tags in an extra NP layer.
    """
    use_lcp_to_np = config.lcp_to_np_typechange
    for node in nodes(root):
        # Exclude the conjuncts in LCP coordination: we want the LCP->NP promotion
        # to apply once to the result of the coordination
        if use_lcp_to_np and node.tag.startswith('LCP') and has_tags(node, 'lr'):
            # if we're in LCP coordination then we want to protect the conjuncts
            # from being converted
            new_node = Node('NP', [node])
            # NOTE(review): node.parent is reassigned *before* the replace_kid call
            # below, so replace_kid receives new_node (not the original parent) as
            # its parent argument -- confirm this ordering is intended.
            node.parent = new_node
            inherit_tag(new_node, node)
            replace_kid(node.parent, node, new_node)
    return root
def postprocess(root):
    """Apply post-labelling fixes to the derivation and return it.

    Currently: when the LCP->NP type-change is enabled, wrap LCP nodes that
    carry the 'l'/'r' tags in an extra NP layer.
    """
    use_lcp_to_np = config.lcp_to_np_typechange
    for node in nodes(root):
        # Exclude the conjuncts in LCP coordination: we want the LCP->NP promotion
        # to apply once to the result of the coordination
        if use_lcp_to_np and node.tag.startswith('LCP') and has_tags(node, 'lr'):
            # if we're in LCP coordination then we want to protect the conjuncts
            # from being converted
            new_node = Node('NP', [node])
            # NOTE(review): node.parent is reassigned *before* the replace_kid call
            # below, so replace_kid receives new_node (not the original parent) as
            # its parent argument -- confirm this ordering is intended.
            node.parent = new_node
            inherit_tag(new_node, node)
            replace_kid(node.parent, node, new_node)
    return root
def accept_derivation(self, bundle):
    """Index (head_index, parent category) pairs by kid-category signature,
    split into unary and binary rule tables with parallel frequency counts."""
    for node in nodes(bundle.derivation):
        if node.is_leaf():
            continue
        entry = (int(node.head_index), str(node.cat))
        if node.count() == 1:
            key = str(node[0].cat)
            self.unary[key].add(entry)
            self.unary_freqs[(key, entry)] += 1
        else:
            key = (str(node[0].cat), str(node[1].cat))
            self.binary[key].add(entry)
            self.binary_freqs[(key, entry)] += 1
def accept_derivation(self, bundle): def kids_have_same_tag(node): def tags_are_equal(t1, t2): if t1[0] == 'V' and t2[0] == 'V': return True if t1[0] == 'N' and t2[0] == 'N': return True return t1 == t2 return all(tags_are_equal(node[0].tag, other.tag) for other in node[1:]) self.nderivs += 1 for node in nodes(bundle.derivation): if (node.count() > 1 and (not node.tag.startswith('NP')) and (not node.tag.startswith('ADJP')) and (not node.tag.startswith('FRAG')) and (not node.tag.startswith('FLR')) and (not base_tag(node.tag) in ('VCD', 'VRD', 'VCP', 'VNV', 'VPT', 'VSB')) and (not kids_have_same_tag(node)) and all(base_tag(kid.tag) in WordTags for kid in node)): self.nbad += 1 print node break
def multi_tgrep(deriv, query_callback_map):
    """Run several tgrep queries over one derivation, calling each query's callback
    on every node it matches (with captured context passed as keyword args)."""
    if not query_callback_map:
        raise RuntimeError('No query expressions given.')
    initialise()
    expressions = query_callback_map.keys()
    if _tgrep_debug:
        # dump the token stream of each query expression
        for expr in expressions:
            debug("Lexing %s", expr)
            lex.input(expr)
            for tok in iter(lex.token, None):
                debug("\t%s %s", tok.type, tok.value)
    parsed = [yacc.parse(expr) for expr in expressions]
    for node in nodes(deriv):
        for query_expr, query_str in izip(parsed, expressions):
            context = Context()
            if not query_expr.is_satisfied_by(node, context):
                continue
            callback = query_callback_map[query_str]
            if context:
                callback(node, **smash_key_case(context))
            else:
                callback(node)
def multi_tgrep(deriv, query_callback_map):
    """Evaluate a batch of tgrep queries against every node of a derivation,
    dispatching matches to the corresponding callbacks."""
    if not query_callback_map:
        raise RuntimeError('No query expressions given.')
    initialise()
    if _tgrep_debug:
        for expression in query_callback_map.keys():
            debug("Lexing %s", expression)
            lex.input(expression)
            for tok in iter(lex.token, None):
                debug("\t%s %s", tok.type, tok.value)
    # parse every query once up front
    queries = [yacc.parse(expression) for expression in query_callback_map.keys()]
    for node in nodes(deriv):
        for query_expr, query_str in izip(queries, query_callback_map.keys()):
            context = Context()
            if query_expr.is_satisfied_by(node, context):
                if context:
                    # pass captured context bindings as keyword arguments
                    query_callback_map[query_str](node, **smash_key_case(context))
                else:
                    query_callback_map[query_str](node)
# Tally recognised combinators vs. UCP (unlike coordinated phrase) rules across
# every derivation in the files named by the command-line glob.
total, with_unrecognised_rules = 0, 0
ucp_rules = defaultdict(lambda: 0)   # (l, r, p) rule triple (as strings) -> frequency
with_ucp = 0                         # number of derivations containing at least one UCP rule
unary, binary = defaultdict(lambda: 0), defaultdict(lambda: 0)

def is_ucp(l, r, p):
    """True if (l, r, p) looks like a UCP rule: a conjunction-like left category
    under a conj-featured parent that differs from the right category."""
    if r is None:
        return False
    return l in (conj, C('LCM'), C(',')) and p.has_feature('conj') and p != r

for file in glob(sys.argv[1]):
    for bundle in CCGbankReader(file):
        has_unrecognised_rules, has_ucp = False, False
        for node in nodes(bundle.derivation):
            if node.is_leaf():
                continue
            # BUG FIX: guard must be count() > 1, not > 0 -- every internal node has
            # at least one kid, so the old test indexed node[1] even for unary nodes.
            # Sibling code in this file uses count() > 1, and is_ucp's `r is None`
            # check is unreachable otherwise.
            lrp = map(lambda e: e and e.cat,
                      (node[0], node[1] if node.count() > 1 else None, node))
            comb = analyse(*lrp)
            l, r, p = lrp
            rule_tuple = (str(l), str(r), str(p))
            if comb:
                combs[comb] += 1
            elif is_ucp(*lrp):
                ucp_rules[rule_tuple] += 1
                if not has_ucp:
                    with_ucp += 1
                    has_ucp = True
def accept_derivation(self, bundle):
    """Accumulate branching-factor statistics: total kid count and internal-node count."""
    for node in nodes(bundle.derivation):
        if node.is_leaf():
            continue
        self.branches += node.count()
        self.internals += 1
def accept_derivation(self, bundle):
    """Count unary-branching nodes across the derivation and bump the sentence count."""
    unary_here = sum(1 for node in nodes(bundle.derivation) if node.count() == 1)
    self.unaries += unary_here
    self.nsents += 1
def preprocess(root):
    """Normalise a PCTB-style derivation tree in place before labelling.

    Applies a long sequence of tree-rewriting fixes for Chinese-treebank
    annotation errors and shape mismatches; each branch cites the derivation
    (section:doc(sent)) that motivated it. Returns the (mutated) root.
    """
    # IP < PP PU -> PP < PP PU (20:58(1))
    if root.count() == 2 and root[1].tag == 'PU' and root[0].tag.startswith('PP'):
        root.tag = root[0].tag
    for node in nodes(root):
        if node.is_leaf(): continue
        if rewrite_lcp_as_np and node.tag.startswith('LCP'):
            node.tag = node.tag.replace('LCP', 'NP')
        first_kid, first_kid_index = get_nonpunct_kid(node, get_last=False)
        last_kid, last_kid_index = get_nonpunct_kid(node, get_last=True)
        # ---------------------
        # Where LPU, RPU are paired punctuation, reshape YP(LPU ... XP RPU YP)
        # into YP(XP(LPU ... XP) YP)
        if any(kid.lex in ("“", "「") for kid in leaf_kids(node)) and any(kid.lex in ("”", "」") for kid in leaf_kids(node)):
            lqu = first_index_such_that(lambda kid: kid.is_leaf() and kid.lex in ("“", "「"), node)
            rqu = first_index_such_that(lambda kid: kid.is_leaf() and kid.lex in ("”", "」"), node)
            if rqu != node.count() - 1:
                quoted_kids = node.kids[lqu:rqu + 1]
                del node.kids[lqu:rqu + 1]
                last_nonpunct_kid, _ = get_nonpunct_element(quoted_kids, get_last=True)
                # Bad punctuation in 27:84(4) causes a mis-analysis, just ignore
                if last_nonpunct_kid:
                    quoted_node = Node(last_nonpunct_kid.tag, quoted_kids)
                    node.kids.insert(lqu, quoted_node)
        # CPTB/Chinese-specific fixes
        # ---------------------------
        # PP(P CP NP) in derivations like 5:11(3) should be PP(P NP(CP NP))
        if first_kid and first_kid.tag == "P" and node.count() > 2:
            last_tag = last_kid.tag
            rest = node.kids[1:]
            del node.kids[1:]
            node.kids.append(Node(last_tag, rest, node))
        # 2:12(3). DNP-PRD fixed by adding a layer of NP
        elif (node.tag.startswith('VP') and node.count() == 2 and
              node[0].tag.startswith('VC') and node[1].tag.startswith('DNP-PRD')):
            node[1] = Node('NP', [node[1]], node)
        # fix missing -OBJ tag from VP object complements (c.f. 31:18(4))
        elif (node.tag.startswith('VP') and node.count() >= 2 and
              node.tag.startswith('VP') and node[0].tag == 'VV' and node[-1].tag == 'NP'):
            node[-1].tag += "-OBJ"
        # fix bad annotation IP < IP (2:7(28)), VP < VP (0:1(5))
        elif any(is_repeated_unary_projection(xp, node) for xp in ('IP', 'VP', 'NP', 'CP')):
            node.kids = node[0].kids
        # treat DP-SBJ as QP-SBJ (6:37(9)): the rationale is that the determiner
        # (e.g. 每) acts as a specifier, just like a quantity
        elif node.tag == 'DP-SBJ':
            node.tag = 'QP-SBJ'
        # attach the PU preceding a PRN under the PRN
        elif last_kid and last_kid.tag == 'PRN' and last_kid.count() == 1:
            maybe_pu = node[last_kid_index - 1]
            if maybe_pu.tag == 'PU':
                del node.kids[last_kid_index - 1]
                last_kid.kids.insert(0, maybe_pu)  # prepend
        # DEG instead of DEC (29:34(3)). if there's a trace in DEG's sibling and
        # no DEC, then change DEG to DEC.
        elif node.tag == 'CP' and node.count() == 2 and node[0].tag == 'IP' and node[1].tag == 'DEG':
            if get_first(node[0], r'^/\*T\*/') and not get_first(node[0], r'/DEC/'):
                node[1].tag = 'DEC'
        elif node.tag.startswith('NP') and any(kid.tag.startswith('QP-APP') for kid in node):
            for kid in node:
                if kid.tag.startswith('QP-APP'):
                    kid.tag = kid.tag.replace('QP', 'NP')
        # NP(CP NP-APP NP-PN) -> NP(CP NP(NP-APP NP-PN)) so that NP(NP-APP NP-PN)
        # can receive NP internal structure-type analysis
        elif (node.tag.startswith('NP') and node.count() == 3 and node[0].tag.startswith('CP')
              and node[1].tag.startswith('NP-APP') and node[2].tag.startswith('NP-PN')):
            np_app, np_pn = node[1], node[2]
            del node.kids[1:]
            node.kids.append(Node(node.tag, [np_app, np_pn], node))
        # IP < NP-SBJ ADVP VP rather than IP < NP-SBJ VP(ADVP VP) (25:59(12), 6:92(19))
        elif (node.tag == 'IP' and node.count() == 3 and node[0].tag == 'NP-SBJ'
              and node[1].tag == 'ADVP' and node[2].tag == 'VP'):
            advp = node.kids.pop(1)
            # VP is the new node[1]
            # now replace node[1] with Node(node[1])
            node[1] = Node(node[1].tag, [advp, node[1]], node)
        # fixing DNP(PN DEG), which causes mis-tagging DNP(PN:l DEG:h)
        # only 3 cases: 23:61(5), 9:14(14), 21:3(11)
        elif node.tag == 'DNP' and node.count() == 2 and node[0].tag == 'PN' and node[1].tag == 'DEG':
            replace_kid(node, node[0], Node('NP', [node[0]]))
        elif is_vnv(node) and node.count() == 3:
            # Re-analyse VNV as coordination
            node[1].tag = 'CC'
        # fix mistaggings of the form ADVP < JJ (1:7(9)), NP < JJ (5:35(1))
        elif node.count() == 1:
            # fix IP < VP by adding *pro*
            if node.tag.startswith('IP') and node[0].tag.startswith('VP'):
                leaf = Leaf('-NONE-', '*pro*', None)
                pro = Node('NP-SBJ', [leaf])
                node.kids.insert(0, pro)
            elif node[0].tag == 'JJ':
                if node.tag.startswith('ADVP'):
                    node.tag = node.tag.replace('ADVP', 'ADJP')
                elif node.tag.startswith('NP'):
                    node.tag = node.tag.replace('NP', 'ADJP')
            # fix NP < VV
            elif node.tag == 'NP' and node[0].tag == 'VV':
                node.tag = node.tag.replace('NP', 'VP')
            # fix NP < ADJP < JJ (5:35(1))
            elif node.tag == 'NP' and node[0].tag == 'ADJP':
                replace_kid(node.parent, node, node[0])
            # fix projections NP < QP
            elif node[0].tag.startswith('QP') and node.tag.startswith('NP'):
                inherit_tag(node[0], node)  # copy PCTB tags from NP to QP
                node.tag = node[0].tag      # copy QP to parent, replacing NP
                node.kids = node[0].kids
            elif node[0].tag == 'IP' and node.tag == 'CP-APP':
                inherit_tag(node[0], node)
                node.tag = node[0].tag
                node.kids = node[0].kids
            # CLP < NN
            elif node[0].tag == 'NN' and node.tag == 'CLP':
                node[0].tag = 'M'
            elif node[0].tag == 'NN' and node.tag.startswith("VP"):
                node[0].tag = 'VV'
            elif node[0].tag == 'CP':
                if node.tag == 'NP-PRD':
                    node.kids = node[0].kids
                else:
                    # Rewrite NP < { CP < { CP < DEC } }
                    # (i.e. 比 报告 的 早 一点) so that it's headed by the 的
                    expr = r'''/CP/ < { /CP/ < /DEC/ }'''
                    if get_first(node[0], expr):
                        node.kids = node[0].kids
            elif node[0].tag in ('NP', 'NP-PN', 'VP', 'IP') and node.tag == 'PRN':
                node.kids = node[0].kids
            # ADVP < CS: shrink so that CS will be considered the head by binarise
            # CP < M: tagging error 7:14(8), 10:51(4), 11:13(32), 11:15(47)
            elif ((node.tag == 'ADVP' and node[0].tag == 'CS') or
                  (node[0].tag == 'M' and node.tag == 'CP')):
                replace_kid(node.parent, node, node[0])
            # fix NP<DNP so that it's headed by the DEC 8:38(18), 0:30(4)
            elif node.tag.startswith('NP') and node[0].tag.startswith('DNP'):
                node.kids = node[0].kids
            # elif node.tag == 'VP' and node[0].tag == 'NP-PRD':
            #     replace_kid(node.parent, node, node[0])
            # couple of noisy derivs like 10:35(80), 10:26(121), 11:37(3)
            # elif node.tag == 'VP' and node[0].tag.startswith('IP'):
            #     replace_kid(node.parent, node, node[0])
        # Reshape LB (long bei)
        # ---------------------
        elif first_kid and first_kid.tag == "LB":
            expr = r'''* < { /LB/=LB [ $ { * < /-(SBJ|OBJ|PN)/a=SBJ < /(V[PV]|VRD|VSB)/=PRED }=IP | $ { /CP/=CP < { *=IP < /-(SBJ|OBJ|PN)/a=SBJ < /(V[PV]|VRD|VSB)/=PRED } } ] }'''
            top, ctx = get_first(node, expr, with_context=True)
            lb, sbj, pred, cp, ip = ctx.lb, ctx.sbj, ctx.pred, ctx.cp, ctx.ip
            top.kids = [lb, Node('IP', [sbj, pred])]
            # top.kids = [lb, sbj, pred]
        # elif False:
        elif first_kid and first_kid.tag == "BA":
            expr = r'''* < { /BA/=LB $ { /IP/ < /NP/=SBJ < /VP/=PRED } }'''
            result = get_first(node, expr, with_context=True)
            if result:
                top, ctx = result
                lb, sbj, pred, ip = ctx.lb, ctx.sbj, ctx.pred, ctx.ip
                # top.kids = [lb, Node('IP', [sbj, pred])]
                top.kids = [lb, sbj, pred]
        # single mistagging CP-SBJ for CP in 24:58(1)
        elif node.tag == 'CP-SBJ':
            node.tag = 'CP'
        else:
            # Fix missing phrasal layer in NP < NN DEG (21:10(4))
            result = get_first(node, r'/DNP/=P < { /N[NRT]/=N $ /DEG/ }', with_context=True)
            if result:
                p, ctx = result
                n = ctx.n
                replace_kid(p, n, Node('NP', [n]))
            # Fix missing phrasal layer in LCP < NN LC (11:17(9))
            result = get_first(node, r'/LCP/=P < { /N[NRT]/=N $ /LC/ }', with_context=True)
            if result:
                p, ctx = result
                n = ctx.n
                replace_kid(p, n, Node('NP', [n]))
            # Fix wrongly attached DEC (5:26(6))
            result = get_first(node, r'/CP/=TOP < { /IP/=P < { /NP/ $ /VP/ $ /DEC/=DEC } }', with_context=True)
            if result:
                _, ctx = result
                top, p, dec = ctx.top, ctx.p, ctx.dec
                top.kids.append(dec)
                p.kids.remove(dec)
            result = get_first(node, r'*=PP < { /IP-TPC/=P <1 { /NP/=T < ^/\*PRO\*/ } <2 /VP/=S }', nonrecursive=True, with_context=True)
            if result:
                _, ctx = result
                pp, p, s = ctx.pp, ctx.p, ctx.s
                inherit_tag(s, p)
                replace_kid(pp, p, s)
            expr = r'''/VP/=VP <1 /VV/=V <2 { /IP-OBJ/ <1 /NP-SBJ/=SBJ <2 /VP/=PRED }'''
            result = get_first(node, expr, with_context=True)
            if result:
                _, ctx = result
                vp, v, sbj, pred = ctx.vp, ctx.v, ctx.sbj, ctx.pred
                del vp.kids
                if get_first(sbj, r'* < ^/\*PRO\*/'):
                    vp.kids = [v, pred]
                else:
                    vp.kids = [v, sbj, pred]
            expr = r'''/QP/=P <1 /CD/ <2 /CC/ <3 /CD/'''
            result = get_first(node, expr, with_context=True)
            if result:
                _, ctx = result
                p = ctx.p
                if p.count() <= 3: continue
                cd_cc_cd, rest = p.kids[0:3], p.kids[3:]
                del p.kids[0:3]
                new_node = Node('QP', cd_cc_cd)
                p.kids.insert(0, new_node)
    return root
def preprocess(root):
    """Normalise a PCTB-style derivation tree in place before labelling.

    Applies a long sequence of tree-rewriting fixes for Chinese-treebank
    annotation errors and shape mismatches; each branch cites the derivation
    (section:doc(sent)) that motivated it. Returns the (mutated) root.
    """
    # IP < PP PU -> PP < PP PU (20:58(1))
    if root.count() == 2 and root[1].tag == 'PU' and root[0].tag.startswith('PP'):
        root.tag = root[0].tag
    for node in nodes(root):
        if node.is_leaf(): continue
        if rewrite_lcp_as_np and node.tag.startswith('LCP'):
            node.tag = node.tag.replace('LCP', 'NP')
        first_kid, first_kid_index = get_nonpunct_kid(node, get_last=False)
        last_kid, last_kid_index = get_nonpunct_kid(node, get_last=True)
        # ---------------------
        # Where LPU, RPU are paired punctuation, reshape YP(LPU ... XP RPU YP)
        # into YP(XP(LPU ... XP) YP)
        if any(kid.lex in ("“", "「") for kid in leaf_kids(node)) and any(kid.lex in ("”", "」") for kid in leaf_kids(node)):
            lqu = first_index_such_that(lambda kid: kid.is_leaf() and kid.lex in ("“", "「"), node)
            rqu = first_index_such_that(lambda kid: kid.is_leaf() and kid.lex in ("”", "」"), node)
            if rqu != node.count()-1:
                quoted_kids = node.kids[lqu:rqu+1]
                del node.kids[lqu:rqu+1]
                last_nonpunct_kid, _ = get_nonpunct_element(quoted_kids, get_last=True)
                # Bad punctuation in 27:84(4) causes a mis-analysis, just ignore
                if last_nonpunct_kid:
                    quoted_node = Node(last_nonpunct_kid.tag, quoted_kids)
                    node.kids.insert(lqu, quoted_node)
        # CPTB/Chinese-specific fixes
        # ---------------------------
        # PP(P CP NP) in derivations like 5:11(3) should be PP(P NP(CP NP))
        if first_kid and first_kid.tag == "P" and node.count() > 2:
            last_tag = last_kid.tag
            rest = node.kids[1:]
            del node.kids[1:]
            node.kids.append(Node(last_tag, rest, node))
        # 2:12(3). DNP-PRD fixed by adding a layer of NP
        elif (node.tag.startswith('VP') and node.count() == 2 and
              node[0].tag.startswith('VC') and node[1].tag.startswith('DNP-PRD')):
            node[1] = Node('NP', [node[1]], node)
        # fix missing -OBJ tag from VP object complements (c.f. 31:18(4))
        elif (node.tag.startswith('VP') and node.count() >= 2 and
              node.tag.startswith('VP') and node[0].tag == 'VV' and node[-1].tag == 'NP'):
            node[-1].tag += "-OBJ"
        # fix bad annotation IP < IP (2:7(28)), VP < VP (0:1(5))
        elif any(is_repeated_unary_projection(xp, node) for xp in ('IP', 'VP', 'NP', 'CP')):
            node.kids = node[0].kids
        # treat DP-SBJ as QP-SBJ (6:37(9)): the rationale is that the determiner
        # (e.g. 每) acts as a specifier, just like a quantity
        elif node.tag == 'DP-SBJ':
            node.tag = 'QP-SBJ'
        # attach the PU preceding a PRN under the PRN
        elif last_kid and last_kid.tag == 'PRN' and last_kid.count() == 1:
            maybe_pu = node[last_kid_index-1]
            if maybe_pu.tag == 'PU':
                del node.kids[last_kid_index-1]
                last_kid.kids.insert(0, maybe_pu)  # prepend
        # DEG instead of DEC (29:34(3)). if there's a trace in DEG's sibling and
        # no DEC, then change DEG to DEC.
        elif node.tag == 'CP' and node.count() == 2 and node[0].tag == 'IP' and node[1].tag == 'DEG':
            if get_first(node[0], r'^/\*T\*/') and not get_first(node[0], r'/DEC/'):
                node[1].tag = 'DEC'
        elif node.tag.startswith('NP') and any(kid.tag.startswith('QP-APP') for kid in node):
            for kid in node:
                if kid.tag.startswith('QP-APP'):
                    kid.tag = kid.tag.replace('QP', 'NP')
        # NP(CP NP-APP NP-PN) -> NP(CP NP(NP-APP NP-PN)) so that NP(NP-APP NP-PN)
        # can receive NP internal structure-type analysis
        elif (node.tag.startswith('NP') and node.count() == 3 and node[0].tag.startswith('CP')
              and node[1].tag.startswith('NP-APP') and node[2].tag.startswith('NP-PN')):
            np_app, np_pn = node[1], node[2]
            del node.kids[1:]
            node.kids.append(Node(node.tag, [np_app, np_pn], node))
        # IP < NP-SBJ ADVP VP rather than IP < NP-SBJ VP(ADVP VP) (25:59(12), 6:92(19))
        elif (node.tag == 'IP' and node.count() == 3 and node[0].tag == 'NP-SBJ'
              and node[1].tag == 'ADVP' and node[2].tag == 'VP'):
            advp = node.kids.pop(1)
            # VP is the new node[1]
            # now replace node[1] with Node(node[1])
            node[1] = Node(node[1].tag, [advp, node[1]], node)
        # fixing DNP(PN DEG), which causes mis-tagging DNP(PN:l DEG:h)
        # only 3 cases: 23:61(5), 9:14(14), 21:3(11)
        elif node.tag == 'DNP' and node.count() == 2 and node[0].tag == 'PN' and node[1].tag == 'DEG':
            replace_kid(node, node[0], Node('NP', [node[0]]))
        elif is_vnv(node) and node.count() == 3:
            # Re-analyse VNV as coordination
            node[1].tag = 'CC'
        # fix mistaggings of the form ADVP < JJ (1:7(9)), NP < JJ (5:35(1))
        elif node.count() == 1:
            # fix IP < VP by adding *pro*
            if node.tag.startswith('IP') and node[0].tag.startswith('VP'):
                leaf = Leaf('-NONE-', '*pro*', None)
                pro = Node('NP-SBJ', [leaf])
                node.kids.insert(0, pro)
            elif node[0].tag == 'JJ':
                if node.tag.startswith('ADVP'):
                    node.tag = node.tag.replace('ADVP', 'ADJP')
                elif node.tag.startswith('NP'):
                    node.tag = node.tag.replace('NP', 'ADJP')
            # fix NP < VV
            elif node.tag == 'NP' and node[0].tag == 'VV':
                node.tag = node.tag.replace('NP', 'VP')
            # fix NP < ADJP < JJ (5:35(1))
            elif node.tag == 'NP' and node[0].tag == 'ADJP':
                replace_kid(node.parent, node, node[0])
            # fix projections NP < QP
            elif node[0].tag.startswith('QP') and node.tag.startswith('NP'):
                inherit_tag(node[0], node)  # copy PCTB tags from NP to QP
                node.tag = node[0].tag      # copy QP to parent, replacing NP
                node.kids = node[0].kids
            elif node[0].tag == 'IP' and node.tag == 'CP-APP':
                inherit_tag(node[0], node)
                node.tag = node[0].tag
                node.kids = node[0].kids
            # CLP < NN
            elif node[0].tag == 'NN' and node.tag == 'CLP':
                node[0].tag = 'M'
            elif node[0].tag == 'NN' and node.tag.startswith("VP"):
                node[0].tag = 'VV'
            elif node[0].tag == 'CP':
                if node.tag == 'NP-PRD':
                    node.kids = node[0].kids
                else:
                    # Rewrite NP < { CP < { CP < DEC } }
                    # (i.e. 比 报告 的 早 一点) so that it's headed by the 的
                    expr = r'''/CP/ < { /CP/ < /DEC/ }'''
                    if get_first(node[0], expr):
                        node.kids = node[0].kids
            elif node[0].tag in ('NP', 'NP-PN', 'VP', 'IP') and node.tag == 'PRN':
                node.kids = node[0].kids
            # ADVP < CS: shrink so that CS will be considered the head by binarise
            # CP < M: tagging error 7:14(8), 10:51(4), 11:13(32), 11:15(47)
            elif ((node.tag == 'ADVP' and node[0].tag == 'CS') or
                  (node[0].tag == 'M' and node.tag == 'CP')):
                replace_kid(node.parent, node, node[0])
            # fix NP<DNP so that it's headed by the DEC 8:38(18), 0:30(4)
            elif node.tag.startswith('NP') and node[0].tag.startswith('DNP'):
                node.kids = node[0].kids
            # elif node.tag == 'VP' and node[0].tag == 'NP-PRD':
            #     replace_kid(node.parent, node, node[0])
            # couple of noisy derivs like 10:35(80), 10:26(121), 11:37(3)
            # elif node.tag == 'VP' and node[0].tag.startswith('IP'):
            #     replace_kid(node.parent, node, node[0])
        # Reshape LB (long bei)
        # ---------------------
        elif first_kid and first_kid.tag == "LB":
            expr = r'''* < { /LB/=LB [ $ { * < /-(SBJ|OBJ|PN)/a=SBJ < /(V[PV]|VRD|VSB)/=PRED }=IP | $ { /CP/=CP < { *=IP < /-(SBJ|OBJ|PN)/a=SBJ < /(V[PV]|VRD|VSB)/=PRED } } ] }'''
            top, ctx = get_first(node, expr, with_context=True)
            lb, sbj, pred, cp, ip = ctx.lb, ctx.sbj, ctx.pred, ctx.cp, ctx.ip
            top.kids = [lb, Node('IP', [sbj, pred])]
            # top.kids = [lb, sbj, pred]
        # elif False:
        elif first_kid and first_kid.tag == "BA":
            expr = r'''* < { /BA/=LB $ { /IP/ < /NP/=SBJ < /VP/=PRED } }'''
            result = get_first(node, expr, with_context=True)
            if result:
                top, ctx = result
                lb, sbj, pred, ip = ctx.lb, ctx.sbj, ctx.pred, ctx.ip
                # top.kids = [lb, Node('IP', [sbj, pred])]
                top.kids = [lb, sbj, pred]
        # single mistagging CP-SBJ for CP in 24:58(1)
        elif node.tag == 'CP-SBJ':
            node.tag = 'CP'
        else:
            # Fix missing phrasal layer in NP < NN DEG (21:10(4))
            result = get_first(node, r'/DNP/=P < { /N[NRT]/=N $ /DEG/ }', with_context=True)
            if result:
                p, ctx = result
                n = ctx.n
                replace_kid(p, n, Node('NP', [n]))
            # Fix missing phrasal layer in LCP < NN LC (11:17(9))
            result = get_first(node, r'/LCP/=P < { /N[NRT]/=N $ /LC/ }', with_context=True)
            if result:
                p, ctx = result
                n = ctx.n
                replace_kid(p, n, Node('NP', [n]))
            # Fix wrongly attached DEC (5:26(6))
            result = get_first(node, r'/CP/=TOP < { /IP/=P < { /NP/ $ /VP/ $ /DEC/=DEC } }', with_context=True)
            if result:
                _, ctx = result
                top, p, dec = ctx.top, ctx.p, ctx.dec
                top.kids.append(dec)
                p.kids.remove(dec)
            result = get_first(node, r'*=PP < { /IP-TPC/=P <1 { /NP/=T < ^/\*PRO\*/ } <2 /VP/=S }', nonrecursive=True, with_context=True)
            if result:
                _, ctx = result
                pp, p, s = ctx.pp, ctx.p, ctx.s
                inherit_tag(s, p)
                replace_kid(pp, p, s)
            expr = r'''/VP/=VP <1 /VV/=V <2 { /IP-OBJ/ <1 /NP-SBJ/=SBJ <2 /VP/=PRED }'''
            result = get_first(node, expr, with_context=True)
            if result:
                _, ctx = result
                vp, v, sbj, pred = ctx.vp, ctx.v, ctx.sbj, ctx.pred
                del vp.kids
                if get_first(sbj, r'* < ^/\*PRO\*/'):
                    vp.kids = [v, pred]
                else:
                    vp.kids = [v, sbj, pred]
            expr = r'''/QP/=P <1 /CD/ <2 /CC/ <3 /CD/'''
            result = get_first(node, expr, with_context=True)
            if result:
                _, ctx = result
                p = ctx.p
                if p.count() <= 3: continue
                cd_cc_cd, rest = p.kids[0:3], p.kids[3:]
                del p.kids[0:3]
                new_node = Node('QP', cd_cc_cd)
                p.kids.insert(0, new_node)
    return root
# Tally recognised combinators vs. UCP (unlike coordinated phrase) rules.
# NOTE(review): this fragment appears to be a truncated duplicate of the tally
# script above -- it ends without setting has_ucp = True, so with_ucp would be
# incremented once per UCP rule rather than once per derivation. Verify against
# the original script.
with_ucp = 0
unary, binary = defaultdict(lambda: 0), defaultdict(lambda: 0)

def is_ucp(l, r, p):
    # a conjunction-like left category under a conj-featured parent that
    # differs from the right category
    if r is None: return False
    return l in (conj, C('LCM'), C(',')) and p.has_feature('conj') and p != r

for file in glob(sys.argv[1]):
    for bundle in CCGbankReader(file):
        has_unrecognised_rules, has_ucp = False, False
        for node in nodes(bundle.derivation):
            if node.is_leaf(): continue
            # NOTE(review): internal nodes always have count() >= 1, so node[1] is
            # indexed even for unary nodes here; sibling code uses count() > 1 --
            # this looks like it should be > 1 as well.
            lrp = map(lambda e: e and e.cat, (node[0], node[1] if node.count() > 0 else None, node))
            comb = analyse(*lrp)
            l, r, p = lrp
            rule_tuple = (str(l), str(r), str(p))
            if comb:
                combs[comb] += 1
            elif is_ucp(*lrp):
                ucp_rules[rule_tuple] += 1
                if not has_ucp:
                    with_ucp += 1
def Dominates(candidate, node, context):
    """True iff the candidate expression is satisfied by some internal node under `node`."""
    if node.is_leaf():
        return False
    return any(candidate.is_satisfied_by(descendant, context)
               for descendant in nodes(node))
def accept_derivation(self, bundle):
    """Collect every atomic category appearing anywhere in the derivation."""
    derivation = bundle.derivation
    for node in nodes(derivation):
        self.atoms.update(ListAtoms.get_atoms(node.cat))
def label(root):
    """Assign head/complement/adjunct role tags to every kid of every internal
    node in the derivation, after preprocess() and before postprocess().

    Tag letters used: h=head, l/r=left/right complement, a=adjunct, m=modifier,
    c/C=conjunct, n/N=NP-internal, p=PRN, &=ETC, @=argument cluster.
    The elif chain is order-sensitive: earlier constructions subsume later ones.
    """
    root = preprocess(root)
    for node in nodes(root):
        if node.is_leaf(): continue
        at_top = False
        if node.parent is None:
            at_top = True
        first_kid, first_kid_index = get_nonpunct_kid(node, get_last=False)
        last_kid, last_kid_index = get_nonpunct_kid(node, get_last=True)
        if first_kid is None: continue
        # first pass: tag kids whose role is determined by their own tag alone
        for kid in node:
            if has_modification_tag(kid):
                tag(kid, 'm')
            elif kid.tag == 'MSP':
                tag(kid, 'a')
            elif kid.tag == 'FLR':
                tag(kid, 'a')
            elif kid.tag == 'ETC':
                tag(kid, '&')
            else:
                tag_if_topicalisation(kid)
        if is_prn(node):
            # PRN tagging error in 10:49(69)
            if not first_kid: continue
            node.tag = first_kid.tag
            tag(node, 'p')
            tag(node[0], 'h')  # assume that the first PU introduces the PRN
        elif node.tag == 'FRAG':
            tag_adjunction(node, last_kid)
        # occasionally something that looks like CCG right absorption occurs
        # in the original annotation (0:23(8))
        elif is_right_absorption(node):
            pass
        elif is_predication(node):
            sbj_assigned = False
            vp_assigned = False
            for kid in reversed(node):
                if not sbj_assigned and (kid.tag.rfind('-SBJ') != -1 or
                        # TODO: we can get IP < NP-PN VP (0:40(5)). is this correct?
                        # exclude NP-PN-LOC (10:62(25))
                        (kid.tag.rfind('-PN') != -1 and kid.tag.rfind('-PN-LOC') == -1) or
                        # NP-APP VP in 11:31(88)
                        (kid.tag.rfind('-APP') != -1) or
                        kid.tag == "NP"):
                    tag(kid, 'l')  # TODO: is subject always left of predicate?
                    sbj_assigned = True
                elif not vp_assigned and kid.tag == 'VP':
                    tag(kid, 'h')
                    vp_assigned = True
                # elif _has_modification_tag(kid) and kid.tag.startswith('IP'):
                #     tag(kid, 'm')
                elif kid.tag not in ('PU', 'CC'):
                    tag(kid, 'a')
            if punct_cued_typechange:
                for i, kid in enumerate(node):
                    try:
                        # exclude IP-SBJ PU VP from having the PU tagged :h (1:53(9))
                        if (kid.tag.startswith('IP-') or kid.tag.startswith('CP-')) and \
                           kid.tag.find('-TPC') == -1 and \
                           kid.tag.find('-SBJ') == -1 and \
                           node[i+1].tag == 'PU':
                            tag(node[i+1], 'h')
                    except: continue
        elif node.count() == 1 and node.tag.startswith('VP') and is_verb_compound(node[0]):
            pass
        elif is_vpt(node):
            # fen de kai, da bu ying. vpt is head-final
            left = True
            for kid in node:
                if kid.tag.startswith("AD") or kid.tag.startswith("DER"):
                    tag(kid, 'h')
                    left = False
                elif left:
                    tag(kid, 'l')
                else:
                    tag(kid, 'r')
        elif is_vsb(node):
            # VSB is modifier+head, and hence is head-final
            tag(first_kid, 'h')
            for kid in node[1:]:
                if is_postverbal_adjunct_tag(kid.tag) or kid.tag.startswith('ADVP'):
                    tag(kid, 'a')  # treat aspect particles as adjuncts
                elif not kid.tag.startswith('PU'):
                    tag(kid, 'r')
        elif is_vcd(node):
            pass
        elif is_vcp(node):
            tag(first_kid, 'h')
            for kid in node[1:]:
                if kid.tag == "VC":
                    tag(kid, 'a')
        elif is_vrd(node) or is_vsb(node):
            # vrd is head-initial
            tag(first_kid, 'h')
            for kid in node[1:]:
                if is_postverbal_adjunct_tag(kid.tag) or kid.tag.startswith('ADVP'):
                    tag(kid, 'a')  # treat aspect particles as adjuncts
                elif not kid.tag.startswith('PU'):
                    tag(kid, 'r')
        elif is_coordination(node, at_top=at_top):
            # coordination
            for kid in node:
                if kid.tag == "ETC":
                    tag(kid, '&')
                if kid.tag not in ('CC', 'PU', 'CSC'):
                    tag(kid, 'c')
        elif is_np_internal_structure(node):
            first = True
            for kid in reversed(node.kids):
                # if kid.tag.startswith('PRN'): continue
                if kid.tag == 'ETC':
                    tag(kid, '&')
                elif kid.tag not in ('CC', 'PU'):
                    if first:
                        tag(kid, 'N')
                        first = False
                    else:
                        tag(kid, 'n')
                else:
                    pass
        # must be above is_coordination (it subsumes UCP)
        elif is_ucp(node):
            left_conjunct_tag = first_kid.tag
            # NOTE:
            # There are some cases where the UCP annotation is suspect (1:36(11))
            # we will obtain the wrong analysis in these cases because the UCP node
            # does not directly dominate its conjuncts
            old_tag = node.tag
            node.tag = left_conjunct_tag
            node.tag = inherit_tag_str(node.tag, old_tag)
            for kid in nodes(node):
                if kid.tag is None:
                    print kid
                    print kid.tag
                if kid.tag.startswith('UCP'):
                    kid.tag = left_conjunct_tag
            for kid in node:
                if kid.tag == 'ETC':
                    tag(kid, '&')
                elif kid.tag not in ('CC', 'PU'):
                    tag(kid, 'C')
        # exclude VP < VV AS: we want to tag this
        elif (node.count() == 1) or is_verb_compound(node):
            pass
        elif ((first_kid.is_leaf()  # head initial complementation
               # quoted verb (see fix in _preprocess_ function)
               # or all((kid.is_leaf() and kid.tag in ('PU', 'VV')) for kid in first_kid)
               or satisfies_all(
                   lambda fkid: any(kid.is_leaf() and kid.tag == 'PU' for kid in fkid),
                   lambda fkid: any(kid.is_leaf() and kid.tag == 'VV' for kid in fkid))(first_kid)
               or is_verb_compound(first_kid)
               # HACK: to fix weird case of unary PP < P causing adjunction analysis
               # instead of head-initial (because of PP IP configuration) in 10:76(4)
               or first_kid.tag == 'PP' and first_kid.count() == 1 and first_kid[0].tag == "P")):
            tag(first_kid, 'h')
            for kid in node[1:]:
                if is_postverbal_adjunct_tag(kid.tag) or kid.tag.startswith('ADVP'):
                    tag(kid, 'a')  # treat aspect particles as adjuncts
                elif not kid.tag.startswith('PU'):
                    tag(kid, 'r')
        # head final complementation
        # This has to come after head initial complementation, otherwise we get
        # the tagging VP < VV:l AS:h
        elif (last_kid.is_leaf() or is_verb_compound(last_kid) or
              # lcp internal structure (cf 10:2(13)) is possible: despite the
              # structure (LCP (NP) (LCP)) this should be treated as head-final
              # complementation, not adjunction.
              is_lcp_internal_structure(last_kid)) and not any(first_kid.tag.find(ftag) != -1 for ftag in FunctionTags):
            if last_kid.tag.startswith('SP'):
                # Treat final 吗 as the head to get the type-change category
                # S[q]\S[dcl] (25:21(5))
                if has_question_tag(node):
                    tag(last_kid, 'h')
                else:
                    tag(last_kid, 'a')
            else:
                tag(last_kid, 'h')
            # cf 2:23(7),1:9(28), a number of derivations have
            # (CP(WHNP-1 CP(IP) DEC) XP) instead of the expected
            # (CP (WHNP-1) CP(IP DEC) XP)
            # This lets us treat what would otherwise be considered head-final
            # as an adjunction
            if last_kid.tag.startswith('DEC'):
                for kid in node[0:-1]:
                    if kid.tag.startswith('WHNP') or kid.tag.startswith('WHPP'):
                        tag(kid, 'a')
                    elif not (kid.tag.startswith('PU') or kid.tag.startswith('ADVP')):
                        # ADVP as sibling of IP in 11:39(63)
                        tag(kid, 'l')
            else:
                for kid in node[0:-1]:
                    if (last_kid.tag in VerbalCategories and (
                            is_postverbal_adjunct_tag(kid.tag) or
                            # exception added to account for direct modification
                            # of V{V,A} with ADVP (0:47(9))
                            kid.tag.startswith('ADVP'))):
                        tag(kid, 'a')  # treat aspect particles as adjuncts
                    elif not kid.tag.startswith('PU'):
                        tag(kid, 'l')
        # TODO: if this is below coordination, then NP(NP-APP PU NP) is considered
        # coordination instead of apposition (10:70(15))
        # actually, what happens if we get english-style apposition (NP1 , NP2)?
        elif is_apposition(node):
            if False:  # any(kid.tag == 'IP-APP' for kid in node):
                tag(last_kid, 'h')
            else:
                tag(last_kid, 'r')  # HACK: assume apposition is right-headed
            for kid in node:
                if not kid.tag.startswith('PU'):
                    # exclude CP-APP (see is_apposition() above)
                    if kid.tag.endswith('-APP') and not kid.tag.startswith('CP'):
                        tag(kid, 'A')
                    else:
                        tag(kid, 'a')
        elif is_modification(node):
            tag(last_kid, 'h')
            for kid in node[0:-1]:
                if has_modification_tag(kid):
                    tag(kid, 'm')
                elif not kid.tag.startswith('PU'):
                    tag(kid, 'a')
                else:
                    tag_if_topicalisation(kid)
        elif is_argument_cluster(node):
            for kid in node:
                tag(kid, '@')
        else:
            # adjunction
            tag_adjunction(node, last_kid)
    root = postprocess(root)
    return root
def label(root):
    """Annotate every internal node of the derivation *root* with marker tags.

    Walks the (preprocessed) tree and, for each internal node, dispatches on
    its constituent type (PRN, FRAG, predication, verb compounds, coordination,
    NP-internal structure, UCP, head-initial/head-final complementation,
    apposition, modification, argument clusters, or plain adjunction) and
    marks each child via tag() with a single-character role, e.g.:
        h head; l/r left/right complement; a adjunct; m modifier;
        c/C conjunct (coordination/UCP); & ETC marker; n/N NP-internal
        structure; p parenthetical; A appositive; @ argument-cluster member.
    PRN and UCP nodes additionally have their own .tag rewritten in place.
    Returns the tree after postprocess().

    NOTE(review): branch order in the elif chain below is load-bearing in
    several places (see the inline comments); do not reorder casually.
    """
    root = preprocess(root)
    for node in nodes(root):
        if node.is_leaf(): continue

        # Only the derivation root has no parent; coordination detection
        # below behaves differently at the top of the tree.
        at_top = False
        if node.parent is None:
            at_top = True

        first_kid, first_kid_index = get_nonpunct_kid(node, get_last=False)
        last_kid, last_kid_index = get_nonpunct_kid(node, get_last=True)
        # Node with punctuation-only children: nothing to label.
        if first_kid is None: continue

        # First pass over the children: marker tags that apply regardless of
        # the node's constituent type.
        for kid in node:
            if has_modification_tag(kid):
                tag(kid, 'm')
            elif kid.tag == 'MSP':
                tag(kid, 'a')
            elif kid.tag == 'FLR':
                tag(kid, 'a')
            elif kid.tag == 'ETC':
                tag(kid, '&')
            else:
                tag_if_topicalisation(kid)

        if is_prn(node):
            # PRN tagging error in 10:49(69)
            if not first_kid: continue

            # The PRN node inherits the category of its first non-punctuation
            # child and is marked as a parenthetical.
            node.tag = first_kid.tag
            tag(node, 'p')
            tag(node[0], 'h') # assume that the first PU introduces the PRN
        elif node.tag == 'FRAG':
            tag_adjunction(node, last_kid)
        # occasionally something that looks like CCG right absorption occurs
        # in the original annotation (0:23(8))
        elif is_right_absorption(node):
            pass
        elif is_predication(node):
            # Scan right-to-left for the subject and the VP head; anything
            # else (bar PU and CC) becomes an adjunct.
            sbj_assigned = False
            vp_assigned = False
            for kid in reversed(node):
                if not sbj_assigned and (
                        kid.tag.rfind('-SBJ') != -1 or
                        # TODO: we can get IP < NP-PN VP (0:40(5)). is this correct?
                        # exclude NP-PN-LOC (10:62(25))
                        (kid.tag.rfind('-PN') != -1 and kid.tag.rfind('-PN-LOC') == -1) or
                        # NP-APP VP in 11:31(88)
                        (kid.tag.rfind('-APP') != -1) or
                        kid.tag == "NP"):
                    tag(kid, 'l') # TODO: is subject always left of predicate?
                    sbj_assigned = True
                elif not vp_assigned and kid.tag == 'VP':
                    tag(kid, 'h')
                    vp_assigned = True
                # elif _has_modification_tag(kid) and kid.tag.startswith('IP'):
                #     tag(kid, 'm')
                elif kid.tag not in ('PU', 'CC'):
                    tag(kid, 'a')

            if punct_cued_typechange:
                for i, kid in enumerate(node):
                    try:
                        # exclude IP-SBJ PU VP from having the PU tagged :h (1:53(9))
                        if (kid.tag.startswith('IP-') or kid.tag.startswith('CP-')) and \
                           kid.tag.find('-TPC') == -1 and \
                           kid.tag.find('-SBJ') == -1 and \
                           node[i+1].tag == 'PU':
                            tag(node[i + 1], 'h')
                    # NOTE(review): bare except -- presumably guarding the
                    # node[i+1] IndexError on the last child, but it also
                    # hides any other error; consider narrowing to IndexError.
                    except:
                        continue
        # exclude VP < VV AS: we want to tag this
        elif node.count() == 1 and node.tag.startswith(
                'VP') and is_verb_compound(node[0]):
            pass
        elif is_vpt(node):
            # fen de kai, da bu ying. vpt is head-final
            left = True
            for kid in node:
                if kid.tag.startswith("AD") or kid.tag.startswith("DER"):
                    tag(kid, 'h')
                    left = False
                elif left:
                    tag(kid, 'l')
                else:
                    tag(kid, 'r')
        elif is_vsb(node):
            # VSB is modifier+head, and hence is head-final
            # NOTE(review): the comment above says head-final, yet the first
            # kid is tagged h and the rest r -- verify which is intended.
            tag(first_kid, 'h')
            for kid in node[1:]:
                if is_postverbal_adjunct_tag(
                        kid.tag) or kid.tag.startswith('ADVP'):
                    tag(kid, 'a') # treat aspect particles as adjuncts
                elif not kid.tag.startswith('PU'):
                    tag(kid, 'r')
        elif is_vcd(node):
            pass
        elif is_vcp(node):
            tag(first_kid, 'h')
            for kid in node[1:]:
                if kid.tag == "VC":
                    tag(kid, 'a')
        # NOTE(review): the is_vsb() disjunct here is unreachable -- the
        # earlier elif is_vsb(node) branch already claims those nodes.
        elif is_vrd(node) or is_vsb(node):
            # vrd is head-initial
            tag(first_kid, 'h')
            for kid in node[1:]:
                if is_postverbal_adjunct_tag(
                        kid.tag) or kid.tag.startswith('ADVP'):
                    tag(kid, 'a') # treat aspect particles as adjuncts
                elif not kid.tag.startswith('PU'):
                    tag(kid, 'r')
        elif is_coordination(node, at_top=at_top): # coordination
            # NOTE(review): two independent ifs (not if/elif), so an ETC kid
            # is tagged both '&' and 'c' here, unlike the UCP branch below --
            # confirm this is intended.
            for kid in node:
                if kid.tag == "ETC":
                    tag(kid, '&')
                if kid.tag not in ('CC', 'PU', 'CSC'):
                    tag(kid, 'c')
        elif is_np_internal_structure(node):
            # Rightmost non-CC/PU kid is the NP head (N); the rest are n.
            first = True
            for kid in reversed(node.kids):
                # if kid.tag.startswith('PRN'): continue
                if kid.tag == 'ETC':
                    tag(kid, '&')
                elif kid.tag not in ('CC', 'PU'):
                    if first:
                        tag(kid, 'N')
                        first = False
                    else:
                        tag(kid, 'n')
                else:
                    pass
        # must be above is_coordination (it subsumes UCP)
        # NOTE(review): this branch is currently *below* is_coordination in
        # the chain, contradicting the comment above -- verify the ordering.
        elif is_ucp(node):
            left_conjunct_tag = first_kid.tag
            # NOTE:
            # There are some cases where the UCP annotation is suspect (1:36(11))
            # we will obtain the wrong analysis in these cases because the UCP node
            # does not directly dominate its conjuncts
            old_tag = node.tag
            node.tag = left_conjunct_tag
            node.tag = inherit_tag_str(node.tag, old_tag)

            # Rewrite any dominated UCP node to the left conjunct's category.
            for kid in nodes(node):
                if kid.tag is None:
                    # NOTE(review): Python 2 debug prints left in place.
                    print kid
                    print kid.tag
                if kid.tag.startswith('UCP'):
                    kid.tag = left_conjunct_tag

            for kid in node:
                if kid.tag == 'ETC':
                    tag(kid, '&')
                elif kid.tag not in ('CC', 'PU'):
                    tag(kid, 'C')
        # exclude VP < VV AS: we want to tag this
        elif (node.count() == 1) or is_verb_compound(node):
            pass
        elif ((first_kid.is_leaf() # head initial complementation
               # quoted verb (see fix in _preprocess_ function)
               # or all((kid.is_leaf() and kid.tag in ('PU', 'VV')) for kid in first_kid)
               or satisfies_all(
                   lambda fkid: any(kid.is_leaf() and kid.tag == 'PU' for kid in fkid),
                   lambda fkid: any(kid.is_leaf() and kid.tag == 'VV' for kid in fkid))(first_kid)
               or is_verb_compound(first_kid)
               # HACK: to fix weird case of unary PP < P causing adjunction analysis instead of head-initial
               # (because of PP IP configuration) in 10:76(4)
               or first_kid.tag == 'PP' and first_kid.count() == 1 and first_kid[0].tag == "P")):
            tag(first_kid, 'h')
            for kid in node[1:]:
                if is_postverbal_adjunct_tag(
                        kid.tag) or kid.tag.startswith('ADVP'):
                    tag(kid, 'a') # treat aspect particles as adjuncts
                elif not kid.tag.startswith('PU'):
                    tag(kid, 'r')
        # head final complementation
        # This has to come after head initial complementation, otherwise we get the tagging VP < VV:l AS:h
        elif (last_kid.is_leaf() or is_verb_compound(last_kid) or
              # lcp internal structure (cf 10:2(13)) is possible: despite the structure (LCP (NP) (LCP))
              # this should be treated as head-final complementation, not adjunction.
              is_lcp_internal_structure(last_kid)) and not any(
                  first_kid.tag.find(ftag) != -1 for ftag in FunctionTags):
            if last_kid.tag.startswith('SP'):
                # Treat final 吗 as the head to get the type-change category S[q]\S[dcl] (25:21(5))
                if has_question_tag(node):
                    tag(last_kid, 'h')
                else:
                    tag(last_kid, 'a')
            else:
                tag(last_kid, 'h')

            # cf 2:23(7),1:9(28), a number of derivations have (CP(WHNP-1 CP(IP) DEC) XP) instead of
            # the expected (CP (WHNP-1) CP(IP DEC) XP)
            # This lets us treat what would otherwise be considered head-final as an
            # adjunction
            if last_kid.tag.startswith('DEC'):
                for kid in node[0:-1]:
                    if kid.tag.startswith('WHNP') or kid.tag.startswith(
                            'WHPP'):
                        tag(kid, 'a')
                    elif not (kid.tag.startswith('PU')
                              or kid.tag.startswith('ADVP')):
                        # ADVP as sibling of IP in 11:39(63)
                        tag(kid, 'l')
            else:
                for kid in node[0:-1]:
                    if (last_kid.tag in VerbalCategories
                            and (is_postverbal_adjunct_tag(kid.tag) or
                                 # exception added to account for direct modification of V{V,A} with ADVP (0:47(9))
                                 kid.tag.startswith('ADVP'))):
                        tag(kid, 'a') # treat aspect particles as adjuncts
                    elif not kid.tag.startswith('PU'):
                        tag(kid, 'l')
        # TODO: if this is below coordination, then NP(NP-APP PU NP) is considered coordination instead of apposition (10:70(15))
        # actually, what happens if we get english-style apposition (NP1 , NP2)?
        elif is_apposition(node):
            # NOTE(review): first branch is permanently disabled (if False);
            # the commented condition suggests IP-APP was once head-final.
            if False: #any(kid.tag == 'IP-APP' for kid in node):
                tag(last_kid, 'h')
            else:
                tag(last_kid, 'r') # HACK: assume apposition is right-headed

            for kid in node:
                if not kid.tag.startswith('PU'):
                    # exclude CP-APP (see is_apposition() above)
                    if kid.tag.endswith(
                            '-APP') and not kid.tag.startswith('CP'):
                        tag(kid, 'A')
                    else:
                        tag(kid, 'a')
        elif is_modification(node):
            tag(last_kid, 'h')
            for kid in node[0:-1]:
                if has_modification_tag(kid):
                    tag(kid, 'm')
                elif not kid.tag.startswith('PU'):
                    tag(kid, 'a')
                else:
                    tag_if_topicalisation(kid)
        elif is_argument_cluster(node):
            for kid in node:
                tag(kid, '@')
        else: # adjunction
            tag_adjunction(node, last_kid)

    root = postprocess(root)
    return root