def get_cluster_info(cluster, gold_doc):
	"""Gather the NER, number, person and gender values seen in a cluster.

	For each mention in `cluster` the lower-cased mention text (looked up in
	gold_doc['text']) is handed to coreference.pronoun_properties_text, and
	every returned property other than 'unknown' is recorded.  A mention's
	NER label is recorded when the mention is a key of gold_doc['ner'].

	Returns a 4-tuple of sets, in the order (ner, number, person, gender).
	"""
	doc_text = gold_doc['text']
	ner_labels = gold_doc['ner']

	found = {'ner': set(), 'number': set(), 'person': set(), 'gender': set()}
	for mention in cluster:
		lowered = coreference_rendering.mention_text(doc_text, mention).lower()
		# The helper hands back its values in (gender, number, person) order.
		gender_value, number_value, person_value = coreference.pronoun_properties_text(lowered)
		observed = (
			('gender', gender_value),
			('number', number_value),
			('person', person_value),
		)
		for key, value in observed:
			if value != 'unknown':
				found[key].add(value)
		if mention in ner_labels:
			found['ner'].add(ner_labels[mention])

	return found['ner'], found['number'], found['person'], found['gender']
def print_pre_change_info(out, auto, gold, auto_mentions, gold_mention_set, text, parses, heads, gold_clusters, gold_mentions, gold_doc, auto_clusters):
	"""Print the cataphora status of every pronoun mention to out['out'].

	Each pronoun found in the gold or auto clusters gets a three-slot list:
	  [0] True/False - it precedes the result of min_non_pronoun for its
	      gold cluster (None if the mention is in no gold cluster)
	  [1] the same test against its auto cluster (None if not in one)
	  [2] True/False - when slots 0 and 1 are both true, whether
	      min_non_pronoun gives the same mention for the auto and gold
	      clusters; None otherwise
	One line, "Cataphoric properties <slots> <lower-cased text>", is printed
	per pronoun.  gold_mention_set and gold_doc are not used here.
	"""
	GOLD_SLOT, AUTO_SLOT, SAME_SLOT = 0, 1, 2
	status = defaultdict(lambda: [None, None, None])

	def record_cataphora(clusters, slot):
		# Flag each pronoun that comes before its cluster's min_non_pronoun result.
		for members in clusters:
			first_non_pronoun = min_non_pronoun(members, text, parses, heads)
			for member in members:
				if coreference.mention_type(member, text, parses, heads) != 'pronoun':
					continue
				status[member][slot] = bool(first_non_pronoun is not None and member < first_non_pronoun)

	record_cataphora(gold, GOLD_SLOT)
	record_cataphora(auto, AUTO_SLOT)

	# Pronouns that are cataphoric in both the gold and the auto clustering.
	cataphoric_in_both = [m for m in status if status[m][GOLD_SLOT] and status[m][AUTO_SLOT]]
	for shared in cataphoric_in_both:
		auto_members = auto_clusters[auto_mentions[shared]]
		gold_members = gold_clusters[gold_mentions[shared]]
		auto_first = min_non_pronoun(auto_members, text, parses, heads)
		gold_first = min_non_pronoun(gold_members, text, parses, heads)
		status[shared][SAME_SLOT] = bool(auto_first == gold_first)

	for pronoun in status:
		flags = status[pronoun]
		lowered = coreference_rendering.mention_text(text, pronoun).lower()
		print >> out['out'], "Cataphoric properties", flags, lowered
def mention_error_properties(mention, cluster, text, parses, heads, gold_doc):
	"""Describe one mention relative to the other mentions of its cluster.

	`cluster` must be a set (set.difference is used) and is expected to
	contain `mention`.  Mentions are indexed as [0], [1], [2]; presumably
	(sentence, start, end) - confirm against the caller.

	Returns a list with, in order:
	  0  the mention type from coreference.mention_type
	  1  the lower-cased mention text, whitespace replaced by '_'
	  2  'text_match' / 'no_text_match' - same lower-cased text as another
	     mention of the cluster
	  3  'head_match' / 'no_head_match' - same lower-cased head word
	  4  'not_nested', 'nested_outside' (another mention lies strictly
	     inside this one), 'nested_inside' (this one lies strictly inside
	     another) or 'nested_both'
	  5  whether the mention is min(cluster)
	  6  whether the mention is max(cluster)
	  7  whether the mention precedes the result of min_non_pronoun
	  8-11  for ner, number, person, gender: '<name>_unknown',
	        '<name>_matches' or '<name>_does_not_match'
	"""
	others = cluster.difference({mention})
	props = []

	# Type of mention
	props.append(coreference.mention_type(mention, text, parses, heads))

	# Text of mention
	own_text = coreference_rendering.mention_text(text, mention).lower()
	props.append('_'.join(own_text.split()))

	# String match against any other mention of the cluster
	same_text = any(
		coreference_rendering.mention_text(text, other).lower() == own_text
		for other in others)
	props.append('text_match' if same_text else 'no_text_match')

	# Head match against any other mention of the cluster
	own_head = coreference.mention_head(mention, text, parses, heads)[1].lower()
	same_head = any(
		own_head == coreference.mention_head(other, text, parses, heads)[1].lower()
		for other in others)
	props.append('head_match' if same_head else 'no_head_match')

	# Nesting: only mentions with the same first field are compared, and the
	# containment must be strict on both ends.
	contains_other = False
	inside_other = False
	for other in others:
		if other[0] != mention[0]:
			continue
		if mention[1] < other[1] and other[2] < mention[2]:
			contains_other = True
		if other[1] < mention[1] and mention[2] < other[2]:
			inside_other = True
		if contains_other and inside_other:
			# Nothing further can change the answer.
			break
	if contains_other and inside_other:
		props.append('nested_both')
	elif contains_other:
		props.append('nested_outside')
	elif inside_other:
		props.append('nested_inside')
	else:
		props.append('not_nested')

	# First / last in the cluster
	props.append(mention == min(cluster))
	props.append(mention == max(cluster))

	# Cataphora
	first_non_pronoun = min_non_pronoun(cluster, text, parses, heads)
	props.append(first_non_pronoun is not None and mention < first_non_pronoun)

	# Agreement of NER, number, person and gender with the rest of the cluster
	rest_info = get_cluster_info(others, gold_doc)
	own_info = get_cluster_info({mention}, gold_doc)
	labels = ['ner', 'number', 'person', 'gender']
	for label, own_values, rest_values in zip(labels, own_info, rest_info):
		if not own_values or not rest_values:
			verdict = '_unknown'
		elif own_values.intersection(rest_values):
			verdict = '_matches'
		else:
			verdict = '_does_not_match'
		props.append(label + verdict)

	return props
def cluster_error_properties(cluster, text, parses, heads, gold_doc):
	"""Summarise a cluster of mentions as a list of properties.

	Returns a list with, in order:
	  0  number of mentions in the cluster
	  1  number of mentions of type 'name'
	  2  number of mentions of type 'nominal'
	  3  number of mentions of type 'pronoun'
	  4  the lower-cased pronoun text when the cluster is exactly one
	     pronoun plus one name/nominal, otherwise None
	  5  number of pronouns that precede the mention returned by
	     min_non_pronoun (0 when that call returns None)
	  6  sorted list of the distinct gold_doc['ner'] labels of the mentions
	  7  whether every mention has the same lower-cased text
	  8  whether every mention has the same lower-cased head word
	"""
	ans = []

	# How big is the cluster
	ans.append(len(cluster))

	# Work out each mention's type once; several properties below need it.
	mention_types = [
		(mention, coreference.mention_type(mention, text, parses, heads))
		for mention in cluster]

	# Counts of each type in the cluster (any other type is ignored)
	slot_for_type = {'name': 0, 'nominal': 1, 'pronoun': 2}
	counts = [0, 0, 0]
	for _, mtype in mention_types:
		if mtype in slot_for_type:
			counts[slot_for_type[mtype]] += 1
	ans += counts

	# If it is one pronoun and something else, more info on the pronoun
	if counts[0] + counts[1] == 1 and counts[2] == 1:
		pronoun = None
		for mention, mtype in mention_types:
			if mtype == 'pronoun':
				pronoun = mention
		ans.append(coreference_rendering.mention_text(text, pronoun).lower())
	else:
		ans.append(None)

	# Number of cataphoric pronouns
	cataphora = 0
	non_pronoun = min_non_pronoun(cluster, text, parses, heads, True)
	# Guard against None explicitly, as the other callers of min_non_pronoun
	# do.  Comparing a mention with None only "worked" through Python 2's
	# arbitrary cross-type ordering (always False here), so the count is
	# unchanged.
	if non_pronoun is not None:
		for mention, mtype in mention_types:
			if mtype == 'pronoun' and mention < non_pronoun:
				cataphora += 1
	ans.append(cataphora)

	# NER types
	gold_ner = gold_doc['ner']
	ans.append(sorted(set(gold_ner[mention] for mention in cluster if mention in gold_ner)))

	# Are all the mentions the same?
	texts = set(
		coreference_rendering.mention_text(text, mention).lower()
		for mention in cluster)
	ans.append(len(texts) == 1)

	# Are all the heads the same?
	head_words = set(
		coreference.mention_head(mention, text, parses, heads)[1].lower()
		for mention in cluster)
	ans.append(len(head_words) == 1)

	return ans
def split_merge_properties(part, cluster, auto, gold, text, parses, heads, gold_mentions, gold_clusters, auto_mentions, gold_doc):
	"""Describe `part` (a subset of `cluster`) against the rest of the cluster.

	`part` and `cluster` must be sets and `part` must be non-empty (one of
	its members is used as a representative).  `gold` is not used here.

	Returns a list with, in order:
	  0      size of part
	  1      size of the rest (cluster minus part)
	  2      '_'-joined lower-cased text if part is a single mention, else None
	  3      '<n>_cataphoric' - pronouns of part that are in auto_mentions
	         and precede the min_non_pronoun result for the cluster's auto
	         mentions, counted only when that result is not itself in part
	  4-6    number of name / nominal / pronoun mentions in part
	  7-9    number of name / nominal / pronoun mentions in the rest
	  10     True if no mention of part is in gold_mentions
	  11     True if no mention of the rest is in gold_mentions
	  12     'string_match' / 'no_string_match' between non-pronouns of part
	         and of the rest
	  13     'head_match' / 'no_head_match' between non-pronouns of part and
	         of the rest
	  14     'delete', 'merge' or 'nothing', judged against the gold clusters
	  15     'introduce', 'split' or 'nothing', judged against the auto clusters
	  16-27  for each of ner, number, person, gender: whether the value sets
	         of rest and part are equal, then 'part_<values>' and
	         'cluster_<values>' with the values sorted and '_'-joined
	"""
	remainder = cluster.difference(part)

	def kind_of(mention):
		return coreference.mention_type(mention, text, parses, heads)

	def lowered_text(mention):
		return coreference_rendering.mention_text(text, mention).lower()

	def lowered_head(mention):
		return coreference.mention_head(mention, text, parses, heads)[1].lower()

	def tally_types(mentions):
		# Counts per mention type, returned as (name, nominal, pronoun).
		tally = {'pronoun': 0, 'name': 0, 'nominal': 0}
		for mention in mentions:
			tally[kind_of(mention)] += 1
		return tally['name'], tally['nominal'], tally['pronoun']

	def non_pronouns_agree(key):
		# True if some non-pronoun of part and some non-pronoun of the rest
		# have equal values under `key`.
		for own in part:
			if kind_of(own) == 'pronoun':
				continue
			for other in remainder:
				if kind_of(other) == 'pronoun':
					continue
				if key(own) == key(other):
					return True
		return False

	# 0, 1: sizes of part and of the rest
	props = [len(part), len(remainder)]

	# 2: if size 1, what the text is
	sole_text = None
	if len(part) == 1:
		sole_text = '_'.join(lowered_text(next(iter(part))).split())
	props.append(sole_text)

	# 3: cataphoric pronouns in part, judged within the auto mentions only
	auto_members = set(mention for mention in cluster if mention in auto_mentions)
	first_non_pronoun = min_non_pronoun(auto_members, text, parses, heads)
	cataphoric = 0
	if first_non_pronoun is not None and first_non_pronoun not in part:
		for mention in part:
			if mention not in auto_mentions or not (mention < first_non_pronoun):
				continue
			if kind_of(mention) == 'pronoun':
				cataphoric += 1
	props.append("%d_cataphoric" % cataphoric)

	# 4-6: names, nominals, pronouns in part; 7-9: the same for the rest
	props.extend(tally_types(part))
	props.extend(tally_types(remainder))

	# 10, 11: whether part / the rest consist only of non-gold mentions
	props.append(all(mention not in gold_mentions for mention in part))
	props.append(all(mention not in gold_mentions for mention in remainder))

	# 12: exact string match between part and rest (excluding pronouns)
	props.append('string_match' if non_pronouns_agree(lowered_text) else 'no_string_match')

	# 13: head match between part and rest (excluding pronouns)
	props.append('head_match' if non_pronouns_agree(lowered_head) else 'no_head_match')

	# 14: what has happened, or will happen, relative to the gold clusters
	example = next(iter(part))
	if example not in gold_mentions:
		gold_action = 'delete'
	elif part != set(gold_clusters[gold_mentions[example]]):
		gold_action = 'merge'
	else:
		gold_action = 'nothing'
	props.append(gold_action)

	# 15: the same question relative to the auto clusters
	auto_action = 'nothing'
	if example not in auto_mentions:
		auto_action = 'introduce'
	else:
		for candidate in auto:
			if example not in candidate:
				continue
			if candidate != part:
				auto_action = 'split'
			break
	props.append(auto_action)

	# 16-27: NER, number, person, gender
	rest_info = get_cluster_info(remainder, gold_doc)
	part_info = get_cluster_info(part, gold_doc)
	for rest_values, part_values in zip(rest_info, part_info):
		props.append(rest_values == part_values)
		props.append('part_' + '_'.join(sorted(part_values)))
		props.append('cluster_' + '_'.join(sorted(rest_values)))

	return props