def do_fix(C, node):
    def bxcomp(L, R, P):
        A = featureless(L.left)
        return A | A

    def bxcomp2(L, R, P):
        A = featureless(L.left.left)
        return A | A

    def innermost_VP(c):
        while c.left.is_complex():
            c = c.left
        return c

    if node.parent and node.parent.count() > 1:
        l, r, p = node.parent[0], node.parent[1], node.parent
        L, R, P = (n.category for n in (l, r, p))

        if p.tag.startswith('VSB'):
            v = innermost_VP(L)
            l.category = v / v
        elif is_modifier_category(R) and L.is_complex() and r is node:
            if C.is_bxcomp2_candidate(L, R, P):
                node.category = bxcomp2(L, R, P)
                debug("Generalised %s to %s", R, node.category)
            elif C.is_bxcomp_candidate(L, R, P):
                node.category = bxcomp(L, R, P)
                debug("Generalised %s to %s", R, node.category)
def clusterfix(self, top, pp, p, s, t):
    debug("Fixing argument cluster coordination: %s", pprint(top))
    debug('T: %s', t)

    # 1. Shrink the verb (node T)
    self.fix_object_gap(pp, p, t, s)

    # 2. Reattach the verb above the TOP node
    new_node = Node('TAG', top.kids, top.category, head_index=0)
    top.kids = [t, new_node]

    # (Reattaching parent pointers)
    for kid in new_node:
        kid.parent = new_node

    # 3. Find and relabel argument clusters
    for node, ctx in find_all(top, r'/VP/=VP <1 /NP/=NP <2 /(QP|V[PV])/=QP', with_context=True):
        vp, np, qp = ctx.vp, ctx.np, ctx.qp

        # Now, VP should have category ((S[dcl]\NP)/QP)/NP
        SbNP = t.category.left.left
        QP, NP = qp.category, np.category

        # NP should have category ((S[dcl]\NP)/QP)\(((S[dcl]\NP)/QP)/NP)
        new_np_category = (SbNP / QP) | ((SbNP / QP) / NP)
        # QP should have category ((S[dcl]\NP)\((S[dcl]\NP)/QP))
        new_qp_category = (SbNP) | ((SbNP) / QP)

        # insert unary nodes
        new_np_node = Node(np.tag, [np], new_np_category, head_index=0)
        np.parent = new_np_node
        new_qp_node = Node(qp.tag, [qp], new_qp_category, head_index=0)
        qp.parent = new_qp_node

        replace_kid(vp, np, new_np_node)
        replace_kid(vp, qp, new_qp_node)

        self.fix_categories_starting_from(new_np_node, top)
def output(self):
    appl_accepted, _ = self.appl_only_filter.compute_accepts_and_rejects()
    null_accepted, _ = self.null_only_filter.compute_accepts_and_rejects()

    # Start by collecting the manually annotated slashes from the file
    aggregate = self.parse_annoform(self.manual_fn)

    # ('accepted' rather than 'set', to avoid shadowing the builtin)
    for (accepted, mode_name) in ((appl_accepted, 'apply'), (null_accepted, 'null')):
        for (cat_string, slash_index, applied_frequency, total_frequency) in \
                sorted(accepted, key=lambda this: this[2], reverse=True):
            # aggregate[re.sub(r'[-*@.]', '', cat_string)][slash_index] = mode_name
            # If there is a slash-mode entry in _aggregate_ already (from the manual list),
            # do not overwrite it.
            slash_to_mode_map = aggregate[re.sub(r'[-*@.]', '', cat_string)]
            if slash_index not in slash_to_mode_map:
                debug("Slash %d of %s will have mode %s", slash_index, cat_string, mode_name)
                slash_to_mode_map[slash_index] = mode_name
            else:
                debug("Not overwriting slash %d of %s", slash_index, cat_string)

    for (category_string, slashes) in aggregate.iteritems():
        for (slash_index, mode_name) in slashes.iteritems():
            print " ".join((category_string, mode_name, str(slash_index)))
def fix_topicalisation_with_gap(self, node, p, s, t):
    debug("Fixing topicalisation with gap:\nnode=%s\ns=%s\nt=%s",
          lrp_repr(node), pprint(s), pprint(t))

    # stop this method from matching again (in case there's absorption on the top node, cf 2:22(5))
    t.tag = base_tag(t.tag, strip_cptb_tag=False)

    # create topicalised category based on the tag of T
    typeraise_t_category = ptb_to_cat(t)
    # insert a node with the topicalised category
    replace_kid(p, t, Node(
        base_tag(t.tag, strip_cptb_tag=False),
        [t],
        typeraise(typeraise_t_category, S, TR_TOPICALISATION),
        head_index=0))

    index = get_trace_index_from_tag(t.tag)

    # attested gaps:
    # 575 IP-TPC:t
    # 134 NP-TPC:t
    #  10 IP-Q-TPC:t
    #   8 CP-TPC:t
    #   4 NP-PN-TPC:t
    #   2 QP-TPC:t
    #   2 NP-TTL-TPC:t
    #   1 PP-TPC:t
    #   1 IP-IJ-TPC:t
    #   1 INTJ-TPC:t
    #   1 CP-Q-TPC:t
    #   1 CP-CND-TPC:t
    expr = r'/IP/=TOP << { *=PP < { *=P < { /[NICQP]P-(?:SBJ|OBJ)/=T < ^/\*T\*%s/ $ *=S } } }' % index

    for top, ctx in find_all(s, expr, with_context=True):
        debug('top: %s', pprint(top))
        self.fix_object_gap(ctx.pp, ctx.p, ctx.t, ctx.s)
        self.fix_categories_starting_from(ctx.s, until=top)
def process_annotator_into_substs(self, fn):
    substs = {}
    slashes = defaultdict(set)

    with file(fn, 'r') as f:
        for lineno, line in enumerate(f):
            line = line.rstrip()
            fields = line.split()
            if len(fields) != 3:
                raise FilterException, ("Missing field at line %d of annotator file %s."
                                        % (lineno, self.anno_filename))

            category_string, replacement_mode_string, slash_index = fields
            debug("Slash %s of %s goes to %s=%d",
                  slash_index, re.sub(r'[-.*@]', '', category_string),
                  replacement_mode_string, self.mode_string_to_index(replacement_mode_string))

            slashes[re.sub(r'[-.*@]', '', category_string)].add(
                (int(slash_index), self.mode_string_to_index(replacement_mode_string)))

    for (category_string, replacements) in slashes.iteritems():
        moded_category = parse_category(category_string)
        moded_category.labelled()

        for (subcategory, slash_index) in moded_category.slashes():
            result = find(lambda (index, mode): index == slash_index, replacements)
            if result:
                replacement_slash, replacement_mode = result
                debug("Setting mode of slash %s of %s to %s",
                      slash_index, moded_category, replacement_mode)
                subcategory.mode = replacement_mode

        substs[category_string] = moded_category

    return substs
def fix_long_bei_gap(self, node, bei, pred, top, n=None, reduced=False):
    debug("Fixing long bei gap: %s", lrp_repr(node))

    if not reduced:
        self.remove_null_element(top)

    if n:
        index = get_trace_index_from_tag(n.tag)
    else:
        index = r'\*'

    expr = r'*=PP < { *=P < { /NP-(?:TPC|OBJ)/=T < ^/%s/a $ *=S } }' % index
    trace_NP, ctx = get_first(top, expr, with_context=True)

    pp, p, t, s = ctx.pp, ctx.p, ctx.t, ctx.s
    # remove T from P
    # replace P with S
    self.fix_object_gap(pp, p, t, s)
    self.fix_categories_starting_from(s, until=top)

    self.relabel_bei_category(top, pred)

    top.category = top[0].category.left
    debug("done %s", pprint(top))
def fix_cat_for(self, leaf, slash_index, mode):
    key_category = re.sub(r'[-.*@]', '', str(leaf.cat))
    if key_category not in self.permitted_cats:
        warn("No entry in splitdef file for category %s", leaf.cat)
        return

    alternatives = self.permitted_cats[key_category]
    #print "All alternatives: %s" % alternatives
    old_modes = self.modes_for_cat(leaf.cat)

    def is_invalid_alternative(alt):
        alt_modes = self.modes_for_cat(alt)
        if len(alt_modes) != len(old_modes):
            warn("Replacement category %s has different size to original category %s",
                 alt, leaf.cat)

        modes_for_comparison = zip(alt_modes, old_modes)
        del modes_for_comparison[slash_index]

        return str(leaf.cat) == str(alt) or \
            any((ModeTier[alt] < ModeTier[old]) for (alt, old) in modes_for_comparison)

    valids = list(reject(alternatives, is_invalid_alternative))
    if not valids:
        warn("No valid alternative for %s which preserves mode `%s' on slash %d",
             leaf.cat, mode, slash_index)
        return

    #print "Alternatives: %s" % valids
    alternative = min(valids, key=lambda e: self.permissiveness(e, slash_index))

    debug("%s `%s' -> %s", leaf.cat, leaf.lex, alternative)
    leaf.cat = alternative
def fix_ip_app(self, p, a, s):
    debug("Fixing IP-APP NX: %s", lrp_repr(p))

    new_kid = copy(a)
    new_kid.tag = base_tag(new_kid.tag)  # relabel to stop infinite matching

    replace_kid(p, a, Node("NN", [new_kid], s.category / s.category, head_index=0))
def fix_short_bei_obj_gap(self, node, pp, bei, beis, t, p, s):
    debug("fixing short bei object gap: pp:%s\np:%s\ns:%s",
          lrp_repr(pp), lrp_repr(p), lrp_repr(s))

    # simple test case in 29:71(3) for bei with extracted NP
    replace_kid(pp, p, s)
    self.fix_categories_starting_from(s, until=bei.parent[1])

    bei.category = bei.category.clone_with(right=bei.parent[1].category)
def fix_short_bei_io_gap(self, node, pp, bei, beis, t, p, s):
    debug("fixing short bei io gap: pp:%s\np:%s\ns:%s",
          lrp_repr(pp), lrp_repr(p), lrp_repr(s))

    replace_kid(pp, p, s)
    self.fix_categories_starting_from(s, until=pp)

    bei.category = bei.category.clone_with(right=beis.category)
def fix_short_bei_subj_gap(self, node, bei, pp, p, t, s):
    debug("fixing short bei subject gap: %s", lrp_repr(pp))

    # take the VP sibling of SB
    # replace T with S
    # this analysis isn't entirely correct
    replace_kid(pp, p, s)
    self.fix_categories_starting_from(s, pp)

    bei.category = bei.category.clone_with(right=bei.parent[1].category)
def fix_topicalisation_without_gap(self, node, p, s, t):
    debug("Fixing topicalisation without gap: %s", pprint(node))

    new_kid = copy(t)
    new_kid.tag = base_tag(new_kid.tag, strip_cptb_tag=False)

    new_category = featureless(p.category) / featureless(s.category)
    replace_kid(p, t, Node(t.tag, [new_kid], new_category, head_index=0))
def label_partial_coordination(node, inside_np=False, ucp=False):
    node[0].category = ptb_to_cat(node[0])
    node.kids[0] = label(node[0], inside_np)

    debug('label_partial_coordination node.category: %s, %s', node.category, node)
    node[1].category = ptb_to_cat(node[1]) if ucp else node.category
    node.kids[1] = label(node[1], inside_np)

    return node
def register_unary(unaries, node, filler):
    '''If _node_ represents the result (RHS) of a unary rule, this records that a new
    dependency must be created between it and its filler, adding it to _unaries_, a list
    of such dependencies created in a given derivation.'''
    node.cat.parg_labelled()
    node.cat.slot.head.lex = filler
    debug("%s head lex <- %s", node, filler)
    unaries.append(node)
def fix_object_extraction(self, _, n, pred, w=None, reduced=False):
    global use_bare_N

    node = n
    debug("Fixing object extraction: %s", lrp_repr(node))

    # We only want this if we are using the N -> NP unary rule
    # This 'fix' lets us rewrite NP(WHNP CP) as NP(CP) with categories NP(N)
    if use_bare_N and pred.tag.startswith('NP'):
        # Fix for the NP(VP de) case:
        # ---------------------------
        #       NP              NP
        #      /  \             |
        #   WHNP   CP    -->    CP
        #         /  \         /  \
        #        IP  DEC      IP  DEC
        if not pred.is_leaf():
            pred.kids.pop(0)
            pred.head_index = 0
    else:
        if not reduced:
            self.remove_null_element(node)

        if w:
            index = get_trace_index_from_tag(w.tag)
        else:
            index = ''

        expr = r'/[IC]P/=TOP << { *=PP < { *=P < { /NP-(OBJ|EXT)/=T << ^/\*T\*%s/ $ *=S } } }' % index

        for trace_NP, ctx in find_all(node, expr, with_context=True):
            top, pp, p, t, s = ctx.top, ctx.pp, ctx.p, ctx.t, ctx.s
            self.fix_object_gap(pp, p, t, s)
            self.fix_categories_starting_from(s, until=top)

            # If we couldn't find the DEC node, this is the null relativiser case
            if not self.relabel_relativiser(pred):
                # TOP is the S node
                # null relativiser category comes from sibling of TOP
                # if TOP has no sibling, then we're likely inside a NP-PRD < CP reduced relative (cf 1:2(9))
                result = get_first(top, r'* $ *=SS', with_context=True, nonrecursive=True)
                if result:
                    _, ctx = result
                    ss = ctx.ss
                    debug("Creating null relativiser unary category: %s", ss.category / ss.category)
                    replace_kid(top.parent, top,
                                Node("NN", [top], ss.category / ss.category, head_index=0))
def fix_modification(self, node, p, s, t):
    debug("Fixing modification: %s", lrp_repr(node))
    S, P = s.category, p.category

    # If you don't strip the tag :m from the newly created child (new_kid),
    # the fix_modification pattern will match infinitely when tgrep visits new_kid
    new_kid = copy(t)
    new_kid.tag = base_tag(new_kid.tag, strip_cptb_tag=False)

    new_category = featureless(P) / featureless(S)
    debug("Creating category %s", new_category)

    replace_kid(p, t, Node(t.tag, [new_kid], new_category, head_index=0))
def relabel_bei_category(self, top, pred):
    # particle 'you' is tagged as a preposition but acts as the BEI marker
    bei, ctx = get_first(
        top,
        r'*=S [ $ /LB/=BEI | $ ^"由"=BEI | $ ^"经"=BEI | $ ^"经过"=BEI | $ ^"随"=BEI | $ ^"为"=BEI | $ ^"以"=BEI | $ ^"经由"=BEI ]',
        with_context=True)
    s, bei = ctx.s, ctx.bei

    bei.category = bei.category.clone_with(right=s.category)
    bei.category.left._right = pred.category

    bei.parent.category = bei.category.left

    debug("new bei category: %s", bei.category)
    return bei
def do_tgrep_with_callback(root, pattern, callback, **kwargs):
    new_root = None

    for match_node, context in tgrep(root, pattern, with_context=True, **kwargs):
        debug("Callback %s matched", callback.__name__)

        if context:
            # only supply a context if the expression binds variables;
            # smash the case, as variables in tgrep expressions are case insensitive
            result = callback(match_node, **smash_key_case(context))
        else:
            result = callback(match_node)

        # a new root will be returned if one has been installed
        if result:
            new_root = result

    return new_root or root
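# A minimal usage sketch for do_tgrep_with_callback, not taken from the original
# sources: `deriv`, the pattern, and the callback are assumptions made up for
# illustration. It also assumes, as the fixes above do, that smash_key_case delivers
# the =VP/=NP captures as lowercased keyword arguments.
def example_single_pattern_pass(deriv):
    def report_np_under_vp(node, **ctx):
        # `vp` and `np` correspond to the =VP and =NP captures in the pattern
        debug("VP %s dominates NP %s", ctx['vp'], ctx['np'])
        # returning a node here would install it as the new root

    return do_tgrep_with_callback(deriv, r'/VP/=VP < /NP/=NP', report_np_under_vp)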
def fix_subject_extraction(self, _, n, pred, w=None, reduced=False):
    global use_bare_N
    debug("%s", reduced)

    node = n
    debug("Fixing subject extraction: %s", lrp_repr(node))

    # We only want this if we are using the N -> NP unary rule
    # This 'fix' lets us rewrite NP(WHNP CP) as NP(CP) with categories NP(N)
    if use_bare_N and pred.tag.startswith('NP'):
        # Fix for the NP(VP de) case:
        # ---------------------------
        #       NP              NP
        #      /  \             |
        #   WHNP   CP    -->    CP
        #         /  \         /  \
        #        IP  DEC      IP  DEC
        if not pred.is_leaf():
            pred.kids.pop(0)
            pred.head_index = 0
    else:
        if not reduced:
            self.remove_null_element(node)

        if w:
            index = get_trace_index_from_tag(w.tag)
        else:
            index = ''

        expr = r'*=PP < { *=P < { /NP-SBJ/=T << ^/\*T\*%s/ $ *=S } }' % index

        for trace_NP, ctx in find_all(node, expr, with_context=True):
            pp, p, t, s = ctx.pp, ctx.p, ctx.t, ctx.s
            self.fix_object_gap(pp, p, t, s)
            self.fix_categories_starting_from(s, until=node)

            if not self.relabel_relativiser(pred):
                # TOP is the shrunk VP
                # after shrinking, we can get VV or VA here
                # left_to_right so that we find the right node (used to match against the CP 已建成的 in 4:45(7))
                result = get_first(
                    node,
                    r'{ /([ICV]P|V[VA]|VRD|VSB|VCD)/=TOP $ *=SS } ! > /([ICV]P|V[VA]|VRD|VSB|VCD)/',
                    with_context=True, left_to_right=True)
                if not result:
                    debug('Could not find verbal category; did not create null relativiser.')
                    return

                top, context = result
                SS = context.ss.category
                debug("Creating null relativiser unary category: %s", SS / SS)
                replace_kid(top.parent, top, Node("NN", [top], SS / SS, head_index=0))
def relabel_relativiser(self, node):
    # Relabel the relativiser category (NP/NP)\S to (NP/NP)\(S|NP)
    result = get_first(node, r'*=S $ /(DEC|SP)/=REL', with_context=True, left_to_right=True)

    if result is not None:
        _, context = result
        s, relativiser = context.s, context.rel

        relativiser.category = relativiser.category.clone_with(right=s.category)
        debug("New rel category: %s", relativiser.category)
        return True
    else:
        warn("Couldn't find relativiser under %s", node)
        return False
def fix_whword_topicalisation(self, node, p, s, t):
    debug('Fixing wh-word topicalisation: node: %s', lrp_repr(node))

    # stop this method from matching again (in case there's absorption on the top node, cf 2:22(5))
    t.tag = base_tag(t.tag, strip_cptb_tag=False)

    # create topicalised category based on the tag of T
    typeraise_t_category = ptb_to_cat(t)
    # insert a node with the topicalised category
    replace_kid(p, t, Node(
        base_tag(t.tag, strip_cptb_tag=False),
        [t],
        typeraise(typeraise_t_category, SbNP, TR_TOPICALISATION),
        head_index=0))

    index = get_trace_index_from_tag(t.tag)

    expr = r'*=PP < { /VP/=P < { /NP-(?:SBJ|OBJ)/=T < ^/\*T\*%s/ $ *=S } }' % index

    for top, ctx in find_all(p, expr, with_context=True):
        replace_kid(ctx.pp, ctx.p, ctx.s)
        self.fix_categories_starting_from(ctx.s, until=top)
def fix_ba_object_gap(self, top, ba, c, d):
    # for trace_NP, ctx in find_all(d, r'*=PP < {*=P < { /NP-OBJ/=T < ^/\*-/ $ *=S } }', with_context=True):
    for trace_NP, ctx in find_all(
            d, r'{ { { /NP-OBJ/=T < ^/\*-/ } $ *=S } > { *=P > *=PP } }',
            with_context=True):
        debug("Found %s", trace_NP)
        pp, p, t, s = ctx.pp, ctx.p, ctx.t, ctx.s
        self.fix_object_gap(pp, p, t, s)
        # self.fix_categories_starting_from(s, until=top)

    # debug("Fixing ba-construction object gap: %s" % lrp_repr(node))
    #
    # for trace_NP, ctx in find_all(top, r'*=PP < {*=P < { /NP-OBJ/=T < ^/\*-/ $ *=S } }', with_context=True):
    #     debug("Found %s", trace_NP)
    #     pp, p, t, s = ctx.pp, ctx.p, ctx.t, ctx.s
    #
    #     self.fix_object_gap(pp, p, t, s)
    #     self.fix_categories_starting_from(s, until=c)
    #     self.relabel_ba_category(top, ba, s)
def multi_tgrep(deriv, query_callback_map):
    if not query_callback_map:
        raise RuntimeError('No query expressions given.')

    initialise()

    if _tgrep_debug:
        for expression in query_callback_map.keys():
            debug("Lexing %s", expression)
            lex.input(expression)
            for tok in iter(lex.token, None):
                debug("\t%s %s", tok.type, tok.value)

    queries = [yacc.parse(expression) for expression in query_callback_map.keys()]

    for node in nodes(deriv):
        for query_expr, query_str in izip(queries, query_callback_map.keys()):
            context = Context()
            if query_expr.is_satisfied_by(node, context):
                if context:
                    query_callback_map[query_str](node, **smash_key_case(context))
                else:
                    query_callback_map[query_str](node)
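# A hedged sketch of driving multi_tgrep, for illustration only: every query is parsed
# once and evaluated against every node in a single traversal, so independent fixes can
# share one pass over the derivation. `deriv`, both patterns, and both callbacks are
# assumptions, not code from this codebase.
def example_multi_query_pass(deriv):
    def on_coordination(node, **ctx):
        debug("conjunction word at %s", node)

    def on_trace(node, **ctx):
        debug("trace %s with sibling %s", node, ctx['s'])

    multi_tgrep(deriv, {
        r'/CC/': on_coordination,        # no captures: the callback receives node only
        r'^/\*T\*/ $ *=S': on_trace,     # the =S capture arrives as keyword `s`
    })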
def fix_nongap_extraction(self, _, n, pred, k):
    node = n
    debug("Fixing nongap extraction: %s", pprint(node))
    debug("k %s", pprint(k))

    self.remove_null_element(node)

    index = get_trace_index_from_tag(k.tag)
    expr = (r'*=PP < { *=P < { /[NPQ]P(?:-%(tags)s)?%(index)s/=T << ^/\*T\*/ $ *=S } }'
            % {'tags': ModifierTagsRegex, 'index': index})

    # we use "<<" in the expression, because fix_*_topicalisation comes
    # before fix_nongap_extraction, and this can introduce an extra layer between
    # the phrasal tag and the trace
    for trace_NP, ctx in find_all(node, expr, with_context=True):
        pp, p, t, s = ctx.pp, ctx.p, ctx.t, ctx.s

        # remove T from P
        # replace P with S
        self.fix_object_gap(pp, p, t, s)

        if not self.relabel_relativiser(pred):
            top, context = get_first(node, r'/[ICV]P/=TOP $ *=SS', with_context=True)
            ss = context.ss
            debug("Creating null relativiser unary category: %s", ss.category / ss.category)
            replace_kid(top.parent, top,
                        Node("NN", [top], ss.category / ss.category, head_index=0))
def unify(L, R, ignore=False, copy_vars=True):
    assgs = []
    for (Ls, Rs) in izip(L.nested_compound_categories(), R.nested_compound_categories()):
        if Ls.slot.is_filled() and Rs.slot.is_filled():
            if (not ignore) and Ls.slot.head.lex != Rs.slot.head.lex:
                raise UnificationException('%s (%s) and %s (%s) both filled'
                                           % (Ls, Ls.slot, Rs, Rs.slot))

        elif Ls.slot.is_filled():
            debug('R %s <- L %s', Rs.slot, Ls.slot)
            Rs.slot.head.lex = Ls.slot.head.lex
            Rs.slot.head.filler = L
            assgs.append((Rs, Ls.slot.head.lex))

        elif Rs.slot.is_filled():
            debug('L %s <- R %s', Ls.slot, Rs.slot)
            Ls.slot.head.lex = Rs.slot.head.lex
            Ls.slot.head.filler = R
            assgs.append((Ls, Rs.slot.head.lex))

        else:  # both slots are variables, need to unify variables
            if Ls.slot == Rs.slot:
                continue
            debug('%s <-> %s (copy_vars=%s)', Ls.slot, Rs.slot, copy_vars)
            if copy_vars:
                Rs.slot.unify_heads(Ls.slot)
                assgs.append((Rs, Ls))

    return assgs
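# A hedged sketch of the unify contract, inferred from the body above rather than
# taken from the original sources: filled head slots propagate across matching
# sub-categories, and the returned assignment list records what was filled so a
# caller can unwind it later. `cat_l` and `cat_r` are hypothetical Category objects.
def example_unify(cat_l, cat_r):
    assignments = unify(cat_l, cat_r)
    for (subcategory, value) in assignments:
        debug("slot of %s received %s", subcategory, value)
    return assignments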
def tgrep(deriv, expression, with_context=False, nonrecursive=False, left_to_right=False):
    '''Performs the given tgrep query on the given tree. If _with_context_ is True, each
    matched node yields a pair (node, context), and captured nodes are accessible by name
    using the dict-like context. If the user wants to keep context around, a copy must be made.'''
    if not expression:
        raise RuntimeError('No query expression given.')

    query = expression_cache.get(expression, None)
    if query is None:
        initialise()

        if _tgrep_debug:
            debug("Lexing %s", expression)
            lex.input(expression)
            for tok in iter(lex.token, None):
                debug("%s %s", tok.type, tok.value)

        query = yacc.parse(expression)
        expression_cache[expression] = query

    # Default traversal method is right to left
    traversal_method = (single if nonrecursive
                        else nodes if left_to_right
                        else nodes_reversed)

    context = Context()
    for node in traversal_method(deriv):
        context.clear()
        if query.is_satisfied_by(node, context):
            if _tgrep_debug:
                debug("%s matched %s", lrp_repr(node), query)
            if with_context:
                yield node, context
            else:
                yield node
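# A minimal usage sketch for tgrep, not from the original sources: the docstring above
# notes that the same Context object is recycled between matches, so it must be copied
# if kept beyond one iteration. `deriv` and the pattern are illustrative assumptions;
# lowercase attribute access on the context mirrors how the fixes above read captures.
def example_tgrep_iteration(deriv):
    for node, ctx in tgrep(deriv, r'/NP/=NP $ /VP/=VP', with_context=True):
        debug("NP %s is a sibling of VP %s", ctx.np, ctx.vp)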
# Chinese CCGbank conversion
# ==========================
# (c) 2008-2012 Daniel Tse <*****@*****.**>
# University of Sydney

# Use of this software is governed by the attached "Chinese CCGbank converter Licence Agreement"
# supplied in the Chinese CCGbank conversion distribution. If the LICENCE file is missing, please
# notify the maintainer Daniel Tse <*****@*****.**>.

try:
    import ply.lex as lex
    import ply.yacc as yacc
except ImportError:
    import lex, yacc

from munge.util.err_utils import debug, info
import munge.proc.tgrep.parse as parse

if __name__ == '__main__':
    import sys

    lex.lex(module=parse)

    expr = sys.argv[1]
    debug("Lexing %s", expr)

    lex.input(expr)
    for tok in iter(lex.token, None):
        debug("%s %s", tok.type, tok.value)
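# Hedged invocation example (the module filename is a placeholder): running this file
# directly lexes the tgrep expression given as the first command-line argument and
# logs each token type/value pair via debug(), e.g.
#   python <this_module>.py '/VP/=VP < { /NP/=NP $ /PU/ }'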
def accept_leaf(self, leaf):
    cat_string_without_modes = leaf.cat.__repr__(show_modes=False)  # Hide modes

    if cat_string_without_modes in self.substs:
        debug("Substituting %s with %s",
              cat_string_without_modes, self.substs[cat_string_without_modes])
        leaf.cat = self.substs[cat_string_without_modes]
def fix_reduced_long_bei_gap(self, node, *args, **kwargs):
    debug("Fixing reduced long bei gap: %s", lrp_repr(node))
    return self.fix_long_bei_gap(node, *args, **update(kwargs, reduced=True))
def fix_categories_starting_from(self, node, until):
    '''Adjusts category labels from _node_ to _until_ (not inclusive) to obtain
    the correct CCG analysis.'''
    while node is not until:
        # Only fix binary rules
        if (not node.parent) or node.parent.count() < 2:
            break

        l, r, p = node.parent[0], node.parent[1], node.parent
        L, R, P = (n.category for n in (l, r, p))
        debug("L: %s R: %s P: %s", L, R, P)

        applied_rule = analyse(L, R, P)
        debug("[ %s'%s' %s'%s' -> %s'%s' ] %s",
              L, ''.join(l.text()), R, ''.join(r.text()), P, ''.join(p.text()), applied_rule)

        if applied_rule is None:
            debug("invalid rule %s %s -> %s", L, R, P)

            if R.is_complex() and R.left.is_complex() and L == R.left.right:
                # L (X|L)|Y -> X|Y becomes
                # X|(X|L) (X|L)|Y -> X|Y
                T = R.left.left
                new_category = typeraise(L, T, TR_FORWARD)  # T/(T|L)
                node.parent[0] = Node(l.tag, [l], new_category, head_index=0)

                new_parent_category = fcomp(new_category, R)
                if new_parent_category:
                    debug("new parent category: %s", new_parent_category)
                    p.category = new_parent_category

                debug("New category: %s", new_category)

            elif L.is_complex() and L.left.is_complex() and R == L.left.right:
                # (X|R)|Y R -> X|Y becomes
                # (X|R)|Y X|(X|R) -> X|Y
                T = L.left.left
                new_category = typeraise(R, T, TR_BACKWARD)  # T|(T/R)
                node.parent[1] = Node(r.tag, [r], new_category, head_index=0)

                new_parent_category = bxcomp(L, new_category)
                if new_parent_category:
                    debug("new parent category: %s", new_parent_category)
                    p.category = new_parent_category

                debug("New category: %s", new_category)

            # conj R -> P
            # Make P into R[conj]
            # L cannot be the comma category (,), otherwise we get a mis-analysis
            # in 2:22(5)
            if str(L) in ('conj', 'LCM'):
                p.category = R.clone_adding_feature('conj')
                debug("New category: %s", p.category)

            # L R[conj] -> P
            elif R.has_feature('conj'):
                new_L = L.clone()
                r.category = new_L.clone_adding_feature('conj')
                p.category = new_L
                debug("New category: %s", new_L)

            elif L.is_leaf():
                # , R -> P[conj] becomes , R -> R[conj]
                if P.has_feature('conj') and l.tag in ('PU', 'CC'):  # treat as partial coordination
                    debug("Fixing coordination: %s" % P)
                    p.category = r.category.clone_adding_feature('conj')
                    debug("new parent category: %s" % p.category)

                # , R -> P becomes , R -> R
                elif l.tag == "PU" and not P.has_feature('conj'):  # treat as absorption
                    debug("Fixing left absorption: %s" % P)
                    p.category = r.category

                # L (X|L)|Y -> X|Y becomes
                # X|(X|L) (X|L)|Y -> X|Y
                elif R.is_complex() and R.left.is_complex() and L == R.left.right:
                    T = R.left.left
                    new_category = typeraise(L, T, TR_FORWARD)  # T/(T|L)
                    node.parent[0] = Node(l.tag, [l], new_category, head_index=0)

                    new_parent_category = fcomp(new_category, R)
                    if new_parent_category:
                        debug("new parent category: %s", new_parent_category)
                        p.category = new_parent_category

                    debug("New category: %s", new_category)

            elif R.is_leaf():
                # R , -> P becomes R , -> R
                if r.tag == "PU":  # treat as absorption
                    debug("Fixing right absorption: %s" % P)
                    p.category = l.category

                # (X|R)|Y R -> X|Y becomes
                # (X|R)|Y X|(X|R) -> X|Y
                elif L.is_complex() and L.left.is_complex() and R == L.left.right:
                    T = L.left.left
                    new_category = typeraise(R, T, TR_BACKWARD)  # T|(T/R)
                    node.parent[1] = Node(r.tag, [r], new_category, head_index=0)

                    new_parent_category = bxcomp(L, new_category)
                    if new_parent_category:
                        debug("new parent category: %s", new_parent_category)
                        p.category = new_parent_category

                    debug("New category: %s", new_category)

            else:
                new_parent_category = None

                # try typeraising fix
                # T/(T/X) (T\A)/X -> T can be fixed:
                # (T\A)/((T\A)/X) (T\A)/X -> T\A
                if self.is_topicalisation(L) and (
                        L.right.right == R.right and P == L.left and P == R.left.left):
                    T_A = R.left
                    X = R.right

                    l.category = T_A / (T_A / X)
                    new_parent_category = T_A

                # (X|X)|Z Y -> X becomes
                # (X|X)|Z X|(X|X) -> X|Z
                elif L.is_complex() and L.left.is_complex() and R == L.left.right:
                    T = L.left.left
                    new_category = typeraise(R, R, TR_BACKWARD, strip_features=False)  # T/(T|L)
                    node.parent[1] = Node(r.tag, [r], new_category, head_index=0)

                    new_parent_category = bxcomp(L, new_category)
                    if new_parent_category:
                        debug("new parent category: %s", new_parent_category)
                        p.category = new_parent_category

                    debug("New category: %s", new_category)

                # Generalise over right modifiers of verbal categories (S[dcl]\X)$
                elif self.is_verbal_category(L) and L.is_complex() and L.left.is_complex():
                    T = L.left.right
                    new_category = typeraise(R, T, TR_BACKWARD)
                    debug('Trying out %s', new_category)

                    if bxcomp(L, new_category):
                        node.parent[1] = Node(r.tag, [r], new_category, head_index=0)
                        new_parent_category = bxcomp(L, new_category)

                # Last ditch: try all of the composition rules to generalise over L R -> P
                if not new_parent_category:
                    # having fxcomp creates bad categories in NP(IP DEC) construction (1:97(3))
                    # but, we need fxcomp to create the gap NP-TPC NP-SBJ(*T*) VP, so allow it
                    # when the rhs doesn't look like the DEC category
                    new_parent_category = (fcomp(L, R)
                        or bcomp(L, R, when=not self.is_relativiser(R))
                        or bxcomp(L, R, when=not self.is_relativiser(R))
                        #or bxcomp2(L, R, when=self.is_verbal_category(L))
                        or fxcomp(L, R, when=not self.is_relativiser(R)))

                if new_parent_category:
                    debug("new parent category: %s", new_parent_category)
                    p.category = new_parent_category
                else:
                    debug("couldn't fix, skipping")

        node = node.parent
        debug('')