def signature(node, d):
    '''Build a two-part, space-separated signature for a node with a DNP child.

    The first part is ``base_tag`` of the tag of ``d[0]``.  The second part is
    ``base_tag`` of the tag of the last child of *node* that comes before the
    first ``DNP`` child and is not tagged ``PU``, ``PRN`` or ``FLR``.

    If *node* has no ``DNP`` child, every child is a candidate.

    Raises IndexError if no candidate child exists.
    '''
    ignored_tags = ('PU', 'PRN', 'FLR')

    # Position of the first DNP child.  Previously this was a *list* of
    # positions that was then compared with an int: always true on Python 2
    # and a TypeError on Python 3, so the "before DNP" restriction never
    # actually applied.
    dnp_index = None
    for i, kid in enumerate(node):
        if kid.tag == 'DNP':
            dnp_index = i
            break

    candidate_tags = [
        kid.tag
        for (i, kid) in enumerate(node)
        if (dnp_index is None or i < dnp_index)
        and kid.tag not in ignored_tags
    ]
    # Deliberately unguarded: an empty candidate list raises IndexError,
    # as the original expression did.
    last_nonpunct_tag = candidate_tags[-1]

    return ' '.join((base_tag(d[0].tag), base_tag(last_nonpunct_tag)))
def accept_derivation(self, bundle):
    '''Count which conjunction words occur under which coordinated tags.

    Every non-leaf node of ``bundle.derivation`` accepted by
    ``is_coordination`` is inspected; for each of its kids tagged exactly
    ``CC`` the pair (base tag of the node, lex of the kid) is incremented in
    ``self.conjs`` and, with the keys swapped, in ``self.inverse``.
    '''
    def is_conjunction(kid):
        return kid.tag == 'CC'

    for node in nodes(bundle.derivation):
        # Leaves and non-coordination nodes contribute nothing.
        if node.is_leaf() or not is_coordination(node):
            continue

        for conj in list(where(is_conjunction, node.kids)):
            parent_tag = base_tag(node.tag)
            self.conjs[parent_tag][conj.lex] += 1
            self.inverse[conj.lex][parent_tag] += 1
def format(self, leaf):
    '''Render *leaf* by interpolating its fields into ``self.format_string``.

    The template may refer to the keys ``lex``, ``pos``, ``cat`` and
    ``stemmed_pos`` (the latter being ``base_tag`` applied to ``leaf.pos1``).
    '''
    template = self.format_string

    fields = {}
    fields['lex'] = leaf.lex
    fields['pos'] = leaf.pos1
    fields['cat'] = str(leaf.cat)
    fields['stemmed_pos'] = base_tag(leaf.pos1)

    return template % fields
def ccgbank_repr(self):
    '''Return the ``(<L ...>)`` leaf string for this object.

    The string contains, in order: the category, the base tag (twice), the
    lexical item and the category again.
    '''
    template = "(<L %(cat)s %(basetag)s %(basetag)s %(lex)s %(cat)s>)"
    values = dict(
        cat=self.category,
        basetag=base_tag(self.tag),
        # NOTE(review): the template never references 'tag'; the entry is
        # kept so that self.detag is still invoked exactly as before.
        tag=self.detag(self.tag),
        lex=self.lex,
    )
    return template % values
def accept_derivation(self, bundle):
    '''Fold one derivation into the running statistics.

    Increments ``self.nderivs``, adds ``len(bundle.derivation.text())`` to
    ``self.nwords``, and then visits every leaf: leaves for which
    ``self.is_trace`` is true bump ``self.ecs`` and the ``self.ec_types``
    entry keyed by ``base_tag(leaf.lex)``; all other leaves have their lex
    added to ``self.tokens``.
    '''
    self.nderivs += 1
    derivation = bundle.derivation
    self.nwords += len(derivation.text())

    for leaf in leaves(bundle.derivation):
        if not self.is_trace(leaf):
            self.tokens.add(leaf.lex)
            continue

        self.ecs += 1
        self.ec_types[base_tag(leaf.lex)] += 1
def accept_derivation(self, bundle):
    '''Flag a derivation containing a suspicious node made only of word tags.

    ``self.nderivs`` is incremented for every derivation.  The nodes of
    ``bundle.derivation`` are then scanned, and the first node meeting all of
    the following is printed and counted in ``self.nbad`` (scanning stops
    there, so at most one node is reported per derivation):

    * it has more than one kid (``node.count() > 1``);
    * its tag does not start with ``NP``, ``ADJP``, ``FRAG`` or ``FLR``;
    * its base tag is not one of ``VCD``, ``VRD``, ``VCP``, ``VNV``, ``VPT``,
      ``VSB``;
    * its kids do not all share a tag (tags starting with ``V`` are treated
      as equal to each other, as are tags starting with ``N``);
    * the base tag of every kid is in ``WordTags``.
    '''
    exempt_prefixes = ('NP', 'ADJP', 'FRAG', 'FLR')
    exempt_base_tags = ('VCD', 'VRD', 'VCP', 'VNV', 'VPT', 'VSB')

    def tags_are_equal(t1, t2):
        # startswith() instead of t1[0] so that an empty tag cannot raise
        # IndexError; it simply falls through to the exact comparison.
        if t1.startswith('V') and t2.startswith('V'):
            return True
        if t1.startswith('N') and t2.startswith('N'):
            return True
        return t1 == t2

    def kids_have_same_tag(node):
        return all(tags_are_equal(node[0].tag, other.tag) for other in node[1:])

    self.nderivs += 1
    for node in nodes(bundle.derivation):
        if (node.count() > 1
                and not node.tag.startswith(exempt_prefixes)
                and base_tag(node.tag) not in exempt_base_tags
                and not kids_have_same_tag(node)
                and all(base_tag(kid.tag) in WordTags for kid in node)):
            self.nbad += 1
            # Parenthesised form prints the same text under the Python 2
            # print statement and the Python 3 print function.
            print(node)
            break
def is_right_absorption(node):
    '''Detect CPTB nodes shaped like CCGbank-style right absorption.

    The CPTB annotation contains some (possibly noisy) nodes of this shape.
    True is returned when the node has exactly two kids, the node and its
    first kid have the same base tag, and the second kid is tagged ``PU``.
    '''
    if node.count() != 2:
        return False
    if base_tag(node.tag) != base_tag(node[0].tag):
        return False
    return node[1].tag == 'PU'
def stem_tag(tag):
    '''Collapse verbal tags to ``V``; hand every other tag to ``base_tag``.

    A tag is treated as verbal when it starts with ``V`` and its second
    character is one of ``V``, ``A``, ``C`` or ``E``.
    '''
    # The length guard stops a one-character tag such as 'V' from raising
    # IndexError on tag[1]; such a tag now goes through base_tag instead.
    if len(tag) > 1 and tag.startswith('V') and tag[1] in 'VACE':
        return 'V'
    return base_tag(tag)
def format(self, leaf):
    '''Fill ``self.format_string`` with the fields of *leaf*.

    Keys offered to the template: ``lex``, ``pos``, ``cat`` (stringified) and
    ``stemmed_pos`` (``base_tag`` of ``leaf.pos1``).
    '''
    template = self.format_string
    substitutions = dict(
        lex=leaf.lex,
        pos=leaf.pos1,
        cat=str(leaf.cat),
        stemmed_pos=base_tag(leaf.pos1),
    )
    return template % substitutions
def __init__(self, var):
    '''Initialise through ``AtomValue.__init__`` with a base-tag predicate.

    The predicate handed to the base initialiser takes two objects and is
    true when ``base_tag`` of their ``cat`` attributes is equal.
    '''
    def same_base_category(a, b):
        return base_tag(a.cat) == base_tag(b.cat)

    AtomValue.__init__(self, var, same_base_category)
def is_right_absorption(node):
    '''Return True for CPTB nodes resembling CCGbank-style right absorption.

    Such (possibly noisy) nodes have exactly two kids, share their base tag
    with their first kid, and have a second kid tagged ``PU``.
    '''
    if node.count() == 2:
        parent_base = base_tag(node.tag)
        first_kid_base = base_tag(node[0].tag)
        if parent_base == first_kid_base:
            return node[1].tag == 'PU'
    return False
def signature(node):
    '''Join the base tags of the non-initial kids of *node* with spaces.

    The first kid is skipped, as are kids tagged ``AS``, ``PU``, ``PRN`` or
    ``FLR``.  An empty string is returned when nothing remains.
    '''
    ignored_tags = ('AS', 'PU', 'PRN', 'FLR')

    parts = []
    for kid in node[1:]:
        if kid.tag in ignored_tags:
            continue
        parts.append(base_tag(kid.tag))
    return ' '.join(parts)
def is_repeated_unary_projection(tag, node):
    '''True if _node_ has _tag_ and its single, non-leaf kid also has _tag_.

    The node matches when its tag starts with _tag_; the kid matches when its
    base tag equals _tag_.
    '''
    if not node.tag.startswith(tag):
        return False
    if node.count() != 1:
        return False

    only_kid = node[0]
    if base_tag(only_kid.tag) != tag:
        return False
    return not only_kid.is_leaf()
def is_partial_ucp(node):
    '''Test whether *node* opens with a conjunction-like leaf before a 'C' kid.

    All of the following must hold: the first kid is a leaf whose tag starts
    with ``CC`` or equals ``PU``; ``has_tag(node[1], 'C')`` is true; and the
    base tag of *node* differs from the base tag of its second kid.
    '''
    head = node[0]
    opens_with_conjunction = head.is_leaf() and (
        head.tag.startswith('CC') or head.tag == 'PU'
    )
    # The second kid is only touched once the first kid has qualified.
    return (
        opens_with_conjunction
        and has_tag(node[1], 'C')
        and base_tag(node.tag) != base_tag(node[1].tag)
    )
def signature(node, d):
    '''Return a two-part, space-separated signature for a DNP-bearing node.

    Part one is ``base_tag`` of the tag of ``d[0]``.  Part two is ``base_tag``
    of the tag of the last kid of *node* that precedes the first ``DNP`` kid
    and is not tagged ``PU``, ``PRN`` or ``FLR``.

    When *node* has no ``DNP`` kid, all kids are candidates.

    Raises IndexError if there is no candidate kid.
    '''
    skipped_tags = ('PU', 'PRN', 'FLR')

    # Find where the first DNP kid sits.  The earlier code kept a list of
    # positions and compared an int against that list, which is always true
    # on Python 2 (and a TypeError on Python 3), so kids after the DNP were
    # never excluded.
    dnp_positions = [i for (i, k) in enumerate(node) if k.tag == 'DNP']
    dnp_index = dnp_positions[0] if dnp_positions else None

    tags_before_dnp = [
        k.tag
        for (i, k) in enumerate(node)
        if (dnp_index is None or i < dnp_index) and k.tag not in skipped_tags
    ]
    # Deliberately unguarded: no candidates means IndexError, as before.
    last_nonpunct_tag = tags_before_dnp[-1]

    return ' '.join((base_tag(d[0].tag), base_tag(last_nonpunct_tag)))
def is_repeated_unary_projection(tag, node):
    '''True if _node_ has _tag_, and the unary child of _node_ also has _tag_.

    _node_ qualifies when its tag starts with _tag_ and it has exactly one
    kid; that kid qualifies when its base tag equals _tag_ and it is not a
    leaf.
    '''
    matches = False
    if node.tag.startswith(tag) and node.count() == 1:
        child = node[0]
        matches = base_tag(child.tag) == tag and not child.is_leaf()
    return matches
def signature(node):
    '''Return the base tag of *node* followed by the base tags of its kids.

    The tags are separated by single spaces, parent first.
    '''
    tags = [base_tag(node.tag)]
    tags.extend(base_tag(kid.tag) for kid in node)
    return ' '.join(tags)