def output(self): headings = ("(%s ,) %s -> %s", "(, %s) %s -> %s", "%s (%s ,) -> %s", "%s (, %s) -> %s") for index, heading in enumerate(headings): env_hash = getattr(self, "e%d" % index) print heading % ("X", "Y", "Z") print "-" * len(heading) examples_hash = getattr(self, "e%d_examples" % index) for (l, r, p), f in sorted_by_value_desc(env_hash): triple = heading % (l, r, p) print "% 10d [%28s] %-60s %s" % ( f, analyse(C(l), C(r), C(p)), triple, " ".join(examples_hash[(l, r, p)]), ) if index == 0 and (l, r, p) in self.e3 and self.e3[(l, r, p)] <= f: alt_triple = headings[3] % (l, r, p) alt_freq = self.e3[(l, r, p)] print "* % 8d%32s%-60s %s" % (alt_freq, " " * 32, alt_triple, " ".join(self.e3_examples[(l, r, p)])) elif index == 3 and (l, r, p) in self.e0 and self.e0[(l, r, p)] <= f: alt_triple = headings[0] % (l, r, p) alt_freq = self.e0[(l, r, p)] print "* % 8d%32s%-60s %s" % (alt_freq, " " * 32, alt_triple, " ".join(self.e0_examples[(l, r, p)]))
def output(self): headings = ("(%s ,) %s -> %s", "(, %s) %s -> %s", "%s (%s ,) -> %s", "%s (, %s) -> %s") for index, heading in enumerate(headings): env_hash = getattr(self, 'e%d' % index) print heading % ('X', 'Y', 'Z') print "-" * len(heading) examples_hash = getattr(self, 'e%d_examples' % index) for (l, r, p), f in sorted_by_value_desc(env_hash): triple = heading % (l, r, p) print "% 10d [%28s] %-60s %s" % (f, analyse(C(l), C(r), C(p)), triple, ' '.join(examples_hash[(l, r, p)])) if (index == 0 and (l, r, p) in self.e3 and self.e3[(l, r, p)] <= f): alt_triple = headings[3] % (l, r, p) alt_freq = self.e3[(l, r, p)] print "* % 8d%32s%-60s %s" % (alt_freq, " "*32, alt_triple, ' '.join(self.e3_examples[(l, r, p)])) elif (index == 3 and (l, r, p) in self.e0 and self.e0[(l, r, p)] <= f): alt_triple = headings[0] % (l, r, p) alt_freq = self.e0[(l, r, p)] print "* % 8d%32s%-60s %s" % (alt_freq, " "*32, alt_triple, ' '.join(self.e0_examples[(l, r, p)]))
def output(self):
    as_left = {}
    as_right = {}
    for (l, r, p), f in self.envs.iteritems():
        if l == ',':
            as_left[(l, r, p)] = f
        else:
            as_right[(l, r, p)] = f

    print ", _ -> _"
    print "--------"
    for (l, r, p), f in sorted_by_value_desc(as_left):
        print "% 10d [%28s] %s %20s -> %s" % (f, analyse(C(l), C(r), C(p)), l, r, p)

    print "_ , -> _"
    print "--------"
    for (l, r, p), f in sorted_by_value_desc(as_right):
        print "% 10d [%28s] %20s %s -> %s" % (f, analyse(C(l), C(r), C(p)), l, r, p)
def ccg2latex(root, glosses=None, abbreviate=False):
    def comb_symbol(comb):
        return arrows.get(comb, 'uline')

    def cat_repr(cat, i):
        cat_str = str(cat)
        if abbreviate is not False:
            if isinstance(abbreviate, xrange):
                # abbreviate only the leaves whose indices fall in the range
                if isinstance(i, int):
                    if i in abbreviate:
                        cat_str = abbr(cat_str)
                elif isinstance(i, xrange):
                    if abbreviate.start <= i.start < i.end <= abbreviate.end:
                        cat_str = abbr(cat_str)
            else:
                cat_str = abbr(cat_str)
        return sanitise_category(cat_str)

    out = ['\\deriv{%d}{' % root.leaf_count()]
    all_leaves = list(leaves(root))

    # lex line
    if glosses is not None:
        leaf_bits = ("\\glosN{%s}{%s}" % (leaf.lex, gloss)
                     for (leaf, gloss) in izip(all_leaves, glosses))
    else:
        leaf_bits = (("\\cjk{%s}" % leaf.lex) for leaf in all_leaves)
    out.append(' & '.join(leaf_bits) + '\\\\')

    # underlines line
    out.append(' & '.join(["\\uline{1}"] * root.leaf_count()) + '\\\\')

    # cats line
    out.append(' & '.join("\\cf{%s}" % cat_repr(leaf.cat, i)
                          for i, leaf in enumerate(all_leaves)) + '\\\\')

    rows = []
    for l, r, p in pairs_postorder(root):
        rows.append((min_leaf_id(p, root), p.cat,
                     analyse(l.cat, r and r.cat, p.cat), p.leaf_count()))

    grouped_subrows = group(rows)
    for subrows in grouped_subrows:
        subline = []
        subout = []
        last_span = 0  # holds the index of the rightmost span in this row
        for leftmost_leaf_id, cat, comb, span in subrows:
            subline.append("&" * (leftmost_leaf_id - last_span) +
                           ("\\%s{%s}" % (comb_symbol(comb), span)))
            # cat_repr expects a leaf index or a range of leaf indices; use
            # xrange so its isinstance(i, xrange) check actually fires
            subout.append("&" * (leftmost_leaf_id - last_span) +
                          ("\\mc{%d}{%s}" % (span, cat_repr(cat, xrange(leftmost_leaf_id, leftmost_leaf_id + span)))))
            last_span = leftmost_leaf_id + span - 1

        # write out underlines line
        out.append(' '.join(subline) + '\\\\')
        # write out cats line
        out.append(' '.join(subout) + '\\\\')

    out.append('}')
    return '\n'.join(out)
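# ccg2latex looks up combinator names (as returned by analyse) in an `arrows`
# table mapping them to LaTeX underline macro names, falling back to a plain
# \uline. The table is defined elsewhere; a sketch with hypothetical macro
# names (only the combinator-name keys are taken from these snippets):
arrows = {
    'fwd_appl': 'fapply',   # forward application
    'bwd_appl': 'bapply',   # backward application
    'fwd_comp': 'fcomp',    # forward composition
    'bwd_comp': 'bcomp',    # backward composition
    'fwd_raise': 'ftype',   # forward type-raising
    'bwd_raise': 'btype',   # backward type-raising
}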
def output(self):
    as_left = {}
    as_right = {}
    for (side, l, r, p), f in self.envs.iteritems():
        if side == AnalyseAbsorption.LEFT:
            as_left[(l, r, p)] = f
        elif side == AnalyseAbsorption.RIGHT:
            as_right[(l, r, p)] = f

    print "(X ,) Y -> Z"
    print "--------"
    for (l, r, p), f in sorted_by_value_desc(as_left):
        print "% 10d [%28s] (%s ,) %s -> %s" % (f, analyse(C(l), C(r), C(p)), l, r, p)

    print "X (, Y) -> Z"
    print "--------"
    for (l, r, p), f in sorted_by_value_desc(as_right):
        print "% 10d [%28s] %s (, %s) -> %s" % (f, analyse(C(l), C(r), C(p)), l, r, p)
def accept_derivation(self, bundle):
    for node in nodes(bundle.derivation):
        if node.is_leaf():
            continue

        self.total += 1
        result = analyse(node.lch.cat, node.rch and node.rch.cat, node.cat)
        if result == 'l_punct_absorb':
            self.l += 1
        elif result == 'r_punct_absorb':
            self.r += 1
        else:
            self.other += 1
def percolate(deriv):
    '''Percolates mode changes made at the leaves up the derivation tree.'''
    for lch, rch in level_order_pairs(deriv):
        parent = lch.parent  # (== rch.parent)
        if rch or parent:  # if lch is not the root
            comb = analyse(lch.cat, rch and rch.cat, parent.cat)

            if str(comb) == 'bwd_r1xcomp':
                # (Y/a)/Z X\Y -> (X/a)/Z
                #    ^     ^      ^  ^
                #    |     |______|  |
                #    |_______________|
                copy_modes(lch.cat, parent.cat)
                copy_modes(rch.cat.left, parent.cat.left.left)

            elif str(comb).endswith('comp'):  # is composition
                lmode = lch.cat.mode  # lch and rch are both necessarily compound
                rmode = rch.cat.mode
                # also check that lmode and rmode include composition
                # maybe issue message saying broken derivation otherwise
                parent.cat.mode = mode_min(lmode, rmode)

                if comb in ("fwd_comp", "fwd_xcomp"):
                    # X/Y Y/Z -> X/Z or X/Y Y\Z -> X\Z
                    # copy modes from arguments to result
                    copy_modes(lch.cat.left, parent.cat.left)
                    copy_modes(rch.cat.right, parent.cat.right)
                elif comb in ("bwd_comp", "bwd_xcomp"):
                    # Y\Z X\Y -> X\Z or Y/Z X\Y -> X/Z
                    copy_modes(rch.cat.left, parent.cat.left)
                    copy_modes(lch.cat.right, parent.cat.right)

            elif str(comb).endswith('type'):  # is type raising
                # uses default mode (which is :all)
                pass

            elif str(comb) == "r_punct_absorb":  # X ; -> X
                copy_modes(lch.cat, parent.cat)

            elif str(comb).endswith('absorb') or str(comb) == "conjoin":
                copy_modes(rch.cat, parent.cat)

            else:  # assume application
                if comb == "fwd_appl":  # X/Y Y -> X
                    copy_modes(lch.cat.left, parent.cat)
                elif comb == "bwd_appl":  # Y X\Y -> X
                    copy_modes(rch.cat.left, parent.cat)
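# percolate() relies on copy_modes, defined elsewhere in the codebase. A
# minimal sketch, assuming it mirrors slash-mode annotations from one category
# onto a structurally identical one:
def copy_modes(frm, to):
    if frm is None or to is None:
        return
    if not (frm.is_leaf() or to.is_leaf()):
        # copy the mode on this slash, then recurse into both subcategories
        to.mode = frm.mode
        copy_modes(frm.left, to.left)
        copy_modes(frm.right, to.right)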
def make_derivation(deriv, assigned_id=None, leaf_id=0):
    '''Generates the body of the DOT representation.'''
    if deriv.is_leaf():
        if write_tree_indices:
            label = "%d %s" % (leaf_id, deriv.label_text())
        else:
            label = deriv.label_text()
        return '''%s [shape="none",height=0.17,label="%s"]\n''' % (assigned_id, label)
    else:
        ret = []
        root_id = assigned_id or get_id()

        for i, child in enumerate(deriv):
            child_id = get_id()

            if isinstance(deriv, (ccg.Leaf, ccg.Node)):
                comb_name = re.escape(Abbreviations.get(
                    analyse(deriv.lch.cat, deriv.rch and deriv.rch.cat, deriv.cat), ''))
                if comb_name:
                    shape_type = "record"
                    label_text = "<o>%s|%s" % (deriv.label_text(), comb_name)
                else:
                    shape_type = "box"
                    label_text = deriv.label_text()

                ret.append('''%s [shape="%s",height=0.1,label="%s"]\n''' % (root_id, shape_type, label_text))

                if config.highlight_head_arrows and i == int(deriv.head_index):
                    ret.append("%s:o -> %s:o [color=red]\n" % (root_id, child_id))
                else:
                    ret.append("%s:o -> %s:o\n" % (root_id, child_id))

                ret.append(make_derivation(child, child_id, leaf_id=leaf_id))
                leaf_id += len(list(leaves(child)))
            else:
                ret.append('''%s [shape="box",height=0.1,label="%s"]\n''' % (root_id, deriv.label_text()))
                ret.append("%s -> %s\n" % (root_id, child_id))
                ret.append(make_derivation(child, child_id, leaf_id=leaf_id))
                leaf_id += len(list(leaves(child)))

        return ''.join(ret)
def applications_per_slash_with_path(orig_path, slash_count, examine_modes=False):
    '''Given the path from a leaf category to the root, returns a list whose
    index _i_ is the rule which consumed the category's _i_th slash, or None
    if that slash was not consumed.'''
    result = []
    for slash in xrange(slash_count):
        consumer = None  # the rule which consumed this slash, if any
        first = True
        # We need to copy the path for each slash, because in each iteration
        # we label the categories in-place.
        orig_path, path = tee(orig_path, 2)

        for (prev_l, prev_r, prev_was_flipped), (l, r, was_flipped) in each_pair(path):
            if first:
                if prev_was_flipped and prev_r:
                    prev_r.labelled()
                elif not prev_was_flipped:
                    prev_l.labelled()
                first = False

            cur = r if was_flipped else l
            prev_cur = prev_r if prev_was_flipped else prev_l

            rule = analyse(prev_l, prev_r, cur, examine_modes)
            label_result(cur, prev_cur, rule, prev_was_flipped)

            if rule == 'fwd_appl':
                consumed_category = prev_l
            elif rule == 'bwd_appl':
                consumed_category = prev_r
            elif rule in ('fwd_comp', 'bwd_comp', 'bwd_xcomp', 'fwd_xcomp'):
                consumed_category = prev_cur
            else:
                consumed_category = None

            if consumed_category and consumed_category.label == slash:
                consumer = rule
                break

        result.append(consumer)

    return result
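# The path-walking functions here consume each_pair, assumed to be the usual
# sliding-window iterator over consecutive elements; a minimal sketch:
from itertools import izip, tee

def each_pair(iterable):
    '''Yields consecutive pairs: s -> (s0, s1), (s1, s2), ...'''
    first, second = tee(iterable, 2)
    next(second, None)  # advance the second copy by one element
    return izip(first, second)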
def _label_result(l, r, p):
    L, R, P = l.cat, r.cat if r else None, p.cat
    app = analyse(L, R, P)
    if not app:
        return
    # print '> %s %s %s %s' % (app, L, R, p.cat)

    if app == 'fwd_appl':  # X/Y Y
        if L.left.label is not None:
            P.labelled(L.left.label)
    elif app == 'bwd_appl':  # Y X\Y
        if R.left.label is not None:
            P.labelled(R.left.label)
    elif app in ('fwd_raise', 'bwd_raise') or app.endswith('gap_topicalisation'):
        # X -> T|(T|X)
        if L.label is not None:
            P.right.right.labelled(L.label)
    elif app in ('fwd_comp', 'fwd_xcomp'):
        # assume left headed
        if L.left.label is not None:
            P.left.labelled(L.left.label)
    elif app in ('bwd_comp', 'bwd_xcomp'):
        if R.left.label is not None:
            P.left.labelled(R.left.label)
def filter_fn(node):
    comb = analyse(node[0].category, node[1].category, node.category)
    return comb not in ('l_punct_absorb', 'r_punct_absorb', 'funny_conj')
def output(self):
    for (l, r, p), freq in sorted_by_value_desc(self.counts):
        print "%8d | %15s %-15s -> %-15s [%s]" % (freq, l, r, p, analyse(l, r, p))
def is_ucp(l, r, p):
    if r is None:
        return False
    return l in (conj, C('LCM'), C(',')) and p.has_feature('conj') and p != r

for file in glob(sys.argv[1]):
    for bundle in CCGbankReader(file):
        has_unrecognised_rules, has_ucp = False, False

        for node in nodes(bundle.derivation):
            if node.is_leaf():
                continue

            lrp = map(lambda e: e and e.cat,
                      (node[0], node[1] if node.count() > 0 else None, node))
            comb = analyse(*lrp)
            l, r, p = lrp
            rule_tuple = (str(l), str(r), str(p))

            if comb:
                combs[comb] += 1
            elif is_ucp(*lrp):
                ucp_rules[rule_tuple] += 1
                if not has_ucp:
                    with_ucp += 1
                    has_ucp = True
            else:
                # Split unrecognised rules by type
                if r:
                    binary[rule_tuple] += 1
                else:
                    unary[rule_tuple] += 1  # assumed counter, by symmetry with binary
def applications_with_path(path):
    '''Yields the sequence of rule applications applied along a _path_ to the root.'''
    for (prev_l, prev_r, _), (l, r, was_flipped) in each_pair(path):
        yield analyse(prev_l, prev_r, r if was_flipped else l)
def fix_categories_starting_from(self, node, until):
    '''Adjusts category labels from _node_ to _until_ (not inclusive) to
    obtain the correct CCG analysis.'''
    while node is not until:
        # Only fix binary rules
        if (not node.parent) or node.parent.count() < 2:
            break

        l, r, p = node.parent[0], node.parent[1], node.parent
        L, R, P = (n.category for n in (l, r, p))
        debug("L: %s R: %s P: %s", L, R, P)

        applied_rule = analyse(L, R, P)
        debug("[ %s'%s' %s'%s' -> %s'%s' ] %s",
              L, ''.join(l.text()), R, ''.join(r.text()),
              P, ''.join(p.text()), applied_rule)

        if applied_rule is None:
            debug("invalid rule %s %s -> %s", L, R, P)

            if R.is_complex() and R.left.is_complex() and L == R.left.right:
                # L (X|L)|Y -> X|Y becomes
                # X|(X|L) (X|L)|Y -> X|Y
                T = R.left.left
                new_category = typeraise(L, T, TR_FORWARD)  # T/(T|L)
                node.parent[0] = Node(l.tag, [l], new_category, head_index=0)

                new_parent_category = fcomp(new_category, R)
                if new_parent_category:
                    debug("new parent category: %s", new_parent_category)
                    p.category = new_parent_category

                debug("New category: %s", new_category)

            elif L.is_complex() and L.left.is_complex() and R == L.left.right:
                # (X|R)|Y R -> X|Y becomes
                # (X|R)|Y X|(X|R) -> X|Y
                T = L.left.left
                new_category = typeraise(R, T, TR_BACKWARD)  # T|(T/R)
                node.parent[1] = Node(r.tag, [r], new_category, head_index=0)

                new_parent_category = bxcomp(L, new_category)
                if new_parent_category:
                    debug("new parent category: %s", new_parent_category)
                    p.category = new_parent_category

                debug("New category: %s", new_category)

            # conj R -> P
            # Make P into R[conj]
            # L cannot be the comma category (,), otherwise we get a
            # mis-analysis in 2:22(5)
            if str(L) in ('conj', 'LCM'):
                p.category = R.clone_adding_feature('conj')
                debug("New category: %s", p.category)

            # L R[conj] -> P
            elif R.has_feature('conj'):
                new_L = L.clone()
                r.category = new_L.clone_adding_feature('conj')
                p.category = new_L
                debug("New category: %s", new_L)

            elif L.is_leaf():
                # , R -> P[conj] becomes , R -> R[conj]
                if P.has_feature('conj') and l.tag in ('PU', 'CC'):
                    # treat as partial coordination
                    debug("Fixing coordination: %s" % P)
                    p.category = r.category.clone_adding_feature('conj')
                    debug("new parent category: %s" % p.category)

                # , R -> P becomes , R -> R
                elif l.tag == "PU" and not P.has_feature('conj'):
                    # treat as absorption
                    debug("Fixing left absorption: %s" % P)
                    p.category = r.category

                # L (X|L)|Y -> X|Y becomes
                # X|(X|L) (X|L)|Y -> X|Y
                elif R.is_complex() and R.left.is_complex() and L == R.left.right:
                    T = R.left.left
                    new_category = typeraise(L, T, TR_FORWARD)  # T/(T|L)
                    node.parent[0] = Node(l.tag, [l], new_category, head_index=0)

                    new_parent_category = fcomp(new_category, R)
                    if new_parent_category:
                        debug("new parent category: %s", new_parent_category)
                        p.category = new_parent_category

                    debug("New category: %s", new_category)

            elif R.is_leaf():
                # R , -> P becomes R , -> R
                if r.tag == "PU":
                    # treat as absorption
                    debug("Fixing right absorption: %s" % P)
                    p.category = l.category

                # (X|R)|Y R -> X|Y becomes
                # (X|R)|Y X|(X|R) -> X|Y
                elif L.is_complex() and L.left.is_complex() and R == L.left.right:
                    T = L.left.left
                    new_category = typeraise(R, T, TR_BACKWARD)  # T|(T/R)
                    node.parent[1] = Node(r.tag, [r], new_category, head_index=0)

                    new_parent_category = bxcomp(L, new_category)
                    if new_parent_category:
                        debug("new parent category: %s", new_parent_category)
                        p.category = new_parent_category

                    debug("New category: %s", new_category)

            else:
                new_parent_category = None

                # try typeraising fix
                # T/(T/X) (T\A)/X -> T can be fixed:
                # (T\A)/((T\A)/X) (T\A)/X -> T\A
                if self.is_topicalisation(L) and (L.right.right == R.right and
                                                  P == L.left and P == R.left.left):
                    T_A = R.left
                    X = R.right

                    l.category = T_A / (T_A / X)
                    new_parent_category = T_A

                # (X|X)|Z Y -> X becomes
                # (X|X)|Z X|(X|X) -> X|Z
                elif L.is_complex() and L.left.is_complex() and R == L.left.right:
                    T = L.left.left
                    new_category = typeraise(R, R, TR_BACKWARD, strip_features=False)  # T/(T|L)
                    node.parent[1] = Node(r.tag, [r], new_category, head_index=0)

                    new_parent_category = bxcomp(L, new_category)
                    if new_parent_category:
                        debug("new parent category: %s", new_parent_category)
                        p.category = new_parent_category

                    debug("New category: %s", new_category)

                # Generalise over right modifiers of verbal categories (S[dcl]\X)$
                elif self.is_verbal_category(L) and L.is_complex() and L.left.is_complex():
                    T = L.left.right
                    new_category = typeraise(R, T, TR_BACKWARD)
                    debug('Trying out %s', new_category)

                    if bxcomp(L, new_category):
                        node.parent[1] = Node(r.tag, [r], new_category, head_index=0)
                        new_parent_category = bxcomp(L, new_category)

                # Last ditch: try all of the composition rules to generalise over L R -> P
                if not new_parent_category:
                    # having fxcomp creates bad categories in NP(IP DEC)
                    # construction (1:97(3)), but we need fxcomp to create the
                    # gap NP-TPC NP-SBJ(*T*) VP, so allow it when the rhs
                    # doesn't look like the DEC category
                    new_parent_category = (fcomp(L, R)
                                           or bcomp(L, R, when=not self.is_relativiser(R))
                                           or bxcomp(L, R, when=not self.is_relativiser(R))
                                           #or bxcomp2(L, R, when=self.is_verbal_category(L))
                                           or fxcomp(L, R, when=not self.is_relativiser(R)))

                if new_parent_category:
                    debug("new parent category: %s", new_parent_category)
                    p.category = new_parent_category
                else:
                    debug("couldn't fix, skipping")

        node = node.parent
    debug('')
def combinators_and_path_from_node(node):
    path = category_path_to_root(node)
    for (prev_l, prev_r, _), (l, r, was_flipped) in each_pair(path):
        l, r, p = prev_l, prev_r, r if was_flipped else l
        yield analyse(l, r, p), (l, r, p)
def match_callback(self, match_node, bundle):
    key = analyse(match_node.lch.cat, match_node.rch.cat, match_node.cat)
    if key is None:
        key = 'other_typechange'
    # print "(%s %s -> %s) %s" % (match_node.lch.cat, match_node.rch.cat, match_node.cat, key)
    self.counts[key] += 1
def mkdeps(root, postprocessor=identity):
    for i, leaf in enumerate(leaves(root)):
        # Uniquify each leaf with an index
        leaf.lex += IndexSeparatorTemplate % i
        # Apply the left to right slash labelling
        # (we abuse this to refer to slots, not slashes)
        leaf.cat.parg_labelled()
        # Populate the outermost (_) variable of each leaf
        leaf.cat.slot.head.lex = leaf.lex

    for (l, r, p) in pairs_postorder(root):
        _label_result(l, r, p)

    global unanalysed

    unaries = []

    for l, r, p in pairs_postorder(root):
        L, R, P = map(lambda x: x and x.cat, (l, r, p))
        comb = analyse(L, R, P)
        if not comb:
            debug("Unrecognised rule %s %s -> %s", L, R, P)

        unifier = []
        if config.debug:
            debug("%s %s %s (%s)", L, R, P, str(comb))

        if comb == 'fwd_appl':  # [Xx/Yy]l Yy -> Xx
            unifier = unify(L.right, R)
            p.cat = L.left

        elif comb == 'bwd_appl':  # Yy [Xx\Yy]r -> Xx
            unifier = unify(L, R.right)
            p.cat = R.left

        # Pro-drops which drop their outer argument
        # [(S_\NPy)_/NPx]_ -> [S_\NPy]_
        elif comb in ('object_prodrop', 'vp_vp_object_prodrop',
                      'yi_subject_prodrop', 'vp_modifier_subject_prodrop'):
            p.cat = L.left

        # [Xx/Yy]l [Yy/Zz]r -> [Xx/Zz]r
        elif comb == 'fwd_comp':  # X/Y Y/Z -> X/Z
            if is_rooted_in(Sdcl, L, respecting_features=True):
                P.slot = L.slot
            else:
                P.slot = R.slot  # lexical head comes from R (Y/Z)

            P.slot.var = fresh_var(prefix='K')

            unifier = unify(L.right, R.left)
            p.cat._left = L.left
            p.cat._right = R.right

        # [Yy\Zz]l [Xx\Yy]r -> [Xx\Zz]l
        elif comb == 'bwd_comp':  # Y\Z X\Y -> X\Z
            if is_rooted_in(Sdcl, R, respecting_features=True):
                P.slot = R.slot
            else:
                P.slot = L.slot  # lexical head comes from L (Y\Z)

            P.slot.var = fresh_var(prefix='K')

            unifier = unify(R.right, L.left)
            p.cat._left = R.left
            p.cat._right = L.right

        elif comb in ('s_np_apposition', 'vp_np_apposition'):
            # { S[dcl], S[dcl]\NP } NPy -> NPy
            P.slot = R.slot  # = copy_vars
            unifier = unify(P, R)

        # NP NP -> N/N
        elif comb == 'np_np_to_nfn_apposition':
            # do the same as NP NP -> NP, except fill in the vars Ny/Ny
            P.right.slot.var = fresh_var(prefix='N')
            P.left.slot = P.right.slot

            register_unary(unaries, p, L.slot.head.lex)
            make_set_head_from(l, r, p)

        elif comb in ('conjoin', 'np_np_apposition'):  # X X[conj] -> X
            make_set_head_from(l, r, p)

        elif comb in ('conj_absorb', 'conj_comma_absorb'):  # conj X -> X[conj]
            copy_vars(frm=R, to=P)
            unify(P, R)  # R.slot.head = P.slot.head

        elif comb == 'funny_conj':  # conj X -> X
            p.cat = R

        elif comb == 'nongap_topicalisation':  # {N, NP, S[dcl], QP}x -> [Sy/Sy]x
            P.slot = L.slot
            P.right.slot.var = fresh_var()
            P.left.slot = P.right.slot

            register_unary(unaries, p, L.slot.head.lex)

        elif comb in ('np_gap_topicalisation', 's_gap_topicalisation',
                      'qp_gap_topicalisation'):  # NPx -> [ Sy/(Sy/NPx)y ]y
            P.right.right.slot = L.slot
            P.slot.var = fresh_var()
            P.left.slot = P.right.left.slot = P.right.slot = P.slot

        elif comb == 'subject_prodrop':
            # (S[dcl]y\NPx)y -> S[dcl]y, or
            # [(S[dcl]y\NPx)y/NPz]y -> (S[dcl]y/NPz)y
            if P == parse_category(r'S[dcl]'):
                P.slot = L.slot
            elif P == parse_category(r'S[dcl]/NP'):
                P.slot = P.left.slot = L.slot
                P.right.slot = L.right.slot
            else:
                warn("Invalid parent category %s for subject prodrop.", P)

        elif comb == 'fwd_xcomp':  # [Xx/Yy]l [Yy\Zz]r -> [Xx/Zz]r
            if is_rooted_in(Sdcl, L, respecting_features=True):
                P.slot = L.slot
            else:
                P.slot = R.slot  # lexical head comes from R (Y/Z)

            P.slot.var = fresh_var(prefix='K')

            unifier = unify(L.right, R.left)
            p.cat._left = L.left
            p.cat._right = R.right

        elif comb == 'bwd_xcomp':  # [Yy/Zz]l [Xx\Yy]r -> [Xx/Zz]l
            if is_rooted_in(Sdcl, R, respecting_features=True):
                P.slot = R.slot
            else:
                P.slot = L.slot  # lexical head comes from L (Y\Z)
            # P.slot = L.slot
            P.slot.var = fresh_var(prefix='K')

            unifier = unify(R.right, L.left)
            p.cat._left = R.left
            p.cat._right = L.right

        elif comb == 'bwd_r1xcomp':  # [(Yy/Zz)k/Ww]l [Xx\Yy]r -> [(Xx\Zz)k/Ww]l
            # TODO: where should P's lexical head come from? L or R?
            unifier = unify(L.left.left, R.right)
            p.cat._left._left = R.left
            p.cat._left._right = L.left.right
            p.cat._right = L.right

        elif comb in ('fwd_raise', 'bwd_raise'):  # Xx -> [ Tf|(Tf|Xx)f ]f
            if P == parse_category(r'(S[dcl]\NP)\((S[dcl]\NP)/(S[dcl]\NP))'):
                # (S[dcl]y\NPz)y -> [ (S[dcl]f\NPg)f/((S[dcl]f\NPg)f\(S[dcl]y\NPz)y)f ]f
                P.left.slot.var = P.left.left.slot.var = P.right.slot.var = P.slot.var = fresh_var()  # f
                P.left.right.slot.var = fresh_var()  # g

                copy_vars(frm=P.left, to=P.right.left)
                copy_vars(frm=L, to=P.right.right)

                unifier = unify(L, P.right.right)

            elif P == parse_category(r'((S[dcl]\NP)/QP)\(((S[dcl]\NP)/QP)/NP)'):
                # NPy -> [ ((S[dcl]v\NPw)v/QPz)v \ ( ((S[dcl]v\NPw)v/QPz)v/NPy )v ]v
                P.slot.var = fresh_var()
                P.left.slot = P.right.slot = \
                    P.left.left.slot = P.left.left.left.slot = \
                    P.right.left.slot = P.right.left.left.slot = \
                    P.right.left.left.left.slot = P.slot  # v

                # P.right.right.slot = fresh_var()  # y
                P.right.right.slot = L.slot

                P.left.right.slot.var = fresh_var('Z')
                P.right.left.right.slot = P.left.right.slot  # z

                P.left.left.right.slot.var = fresh_var('W')
                P.right.left.left.right.slot = P.left.left.right.slot  # w

                unifier = unify(L, P.right.right)

            elif P == parse_category(r'(S[dcl]\NP)\((S[dcl]\NP)/QP)'):
                # QPy -> [ (S[dcl]v\NPz)v \ ((S[dcl]v\NPz)v/QPy)v ]v
                P.slot.var = fresh_var()
                P.left.slot = P.left.left.slot = \
                    P.right.slot = P.right.left.slot = P.right.left.left.slot = P.slot  # v

                # P.right.right.slot = fresh_var()  # y
                P.right.right.slot = L.slot

                P.left.right.slot.var = fresh_var('Z')
                P.right.left.right.slot = P.left.right.slot  # z

                unifier = unify(L, P.right.right)

            else:
                P.slot.var = fresh_var()

                P.right.left.slot = P.left.slot = P.right.slot = P.slot
                P.right.right.slot = L.slot

                unifier = unify(L, P.right.right)

        elif comb == 'np_typechange':
            P.slot = L.slot  # = copy_vars
            unifier = unify(P, L)

        elif comb == 'lcp_np_typechange':
            P.slot = L.slot
            unifier = unify(P, L)

        elif comb in ('lcp_sfs_typechange', 'lcp_nfn_typechange'):
            P.left.slot.var = fresh_var()
            P.right.slot = P.left.slot
            P.slot = L.slot

            register_unary(unaries, p, L.slot.head.lex)

        elif comb == 'lcp_sbnpfsbnp_typechange':  # [(Sy\NPz)y/(Sy\NPz)y]_
            P.left.slot.var = fresh_var()
            P.left.left.slot = P.right.left.slot = P.right.slot = P.left.slot

            register_unary(unaries, p, L.slot.head.lex)

        elif comb == 'null_relativiser_typechange':  # Xy -> (Nf/Nf)y
            P.slot = L.slot

            if P == _NfN:
                P.left.slot.var = fresh_var()
                P.right.slot = P.left.slot

                register_unary(unaries, p, L.slot.head.lex)

            elif P == _NfNfNfN:
                P.left.slot.var = fresh_var()
                P.left.left.slot.var = fresh_var(prefix="G")

                P.left.right.slot = P.left.left.slot
                P.right.slot = P.left.slot

                register_unary(unaries, p, L.slot.head.lex)

            else:
                warn("Unhandled null relativiser typechange: %s -> %s", L, P)

        # [NP/NP]y -> NPy
        elif comb == 'de_nominalisation':
            P.slot = L.slot

            register_unary(unaries, p, L.slot.head.lex)

        # {M, QP}y -> (Nf/Nf)y
        elif comb == 'measure_word_number_elision':
            P.slot = L.slot

            P.left.slot.var = fresh_var()
            P.right.slot = P.left.slot

            register_unary(unaries, p, L.slot.head.lex)

        elif comb == 'l_punct_absorb':  # , X -> X[conj]
            # need to put conj feature back on parent
            p.cat = R.clone_adding_feature('conj')

        elif comb == 'r_punct_absorb':
            p.cat = L

        elif R and L == R and is_rooted_in(parse_category('S'), L):
            # VCD (stopgap)
            make_set_head_from(l, r, p)

        else:
            debug('Unhandled combinator %s (%s %s -> %s)', comb, L, R, P)
            unanalysed.add(comb)

            P.slot = R.slot if R else L.slot

        for (dest, src) in unifier:
            if isinstance(src, (basestring, list)):
                # Fake bidirectional unification:
                # -------------------------------
                # If variable X has been unified with value v, rewrite all
                # mentions of v in the output category to point to variable X
                # (v is uniquified by concatenating it with an ID, so this
                # should hold)
                for subcat in p.cat.nested_compound_categories():
                    if subcat.slot.head.lex == src:
                        subcat.slot = dest.slot

        if config.debug:
            debug("> %s" % p.cat)
            debug('---')

        if config.fail_on_unassigned_variables:
            assert no_unassigned_variables(p.cat), "Unassigned variables in %s" % p.cat

    if config.debug:
        debug('unaries: %s', unaries)

    # Collect deps from arguments
    deps = []
    for l in chain(leaves(root), unaries):
        if config.debug:
            debug("%s %s", l, l.cat)

        C = l.cat
        while not C.is_leaf():
            arg = C.right
            if arg.slot.head.filler:  # and not l.cat.left.slot == l.cat.right.slot
                # print "%s %s %s %s %s %s" % (C.slot.head.lex, C, arg.slot.head.lex, arg, l.cat, C.label)
                if C.label is None:
                    warn("Dependency generated on slash without label: %s %s", C, arg)
                deps.append((C.slot.head.lex, arg.slot.head.lex, l.cat, C.label))
            if is_modifier(C):
                break
            C = C.left

    # Produce dep pairs
    result = set()
    for depl, depr, head_cat, head_label in deps:
        for sdepl in set(seqify(depl)):
            for sdepr in set(seqify(depr)):
                if not (sdepl and sdepr):
                    debug("Dependency with None: %s %s", sdepl, sdepr)
                    continue

                result.add((postprocessor(sdepl), postprocessor(sdepr), head_cat, head_label))

    if config.debug:
        for line in write_deps(result):
            debug(line)

    return result
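# A hypothetical driver for mkdeps, assuming the same CCGbankReader used in
# the rule-counting script above. mkdeps returns a set of
# (head lex, argument lex, head category, slot label) tuples.
import sys

if __name__ == '__main__':
    for bundle in CCGbankReader(sys.argv[1]):
        for head, arg, cat, label in sorted(mkdeps(bundle.derivation)):
            print "%s %s %s %s" % (head, arg, cat, label)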