Ejemplo n.º 1
0
def compare_trees(gold_tree, test_tree, out_dict, error_counts, classify):
    """Report the errors in test_tree and the greedy path that repairs them.

    Writes the initial error count and the search statistics to
    out_dict['out'].  When greedy_search returns a path, the test and gold
    trees are written to out_dict['test_trees'] and out_dict['gold_trees'],
    every step after the first is summarised, and, if the path has more
    than one step, each step is printed in full while its change value is
    appended to error_counts[<classified type>].  Otherwise "no path found"
    is written.  A blank line is always written to out_dict['err'] and
    out_dict['out'] at the end.

    Args:
        gold_tree: the reference tree.
        test_tree: the tree being evaluated.
        out_dict: mapping of stream names ('out', 'err', 'test_trees',
            'gold_trees') to writable file-like objects.
        error_counts: mapping from classified error type to a list.
        classify: forwarded unchanged to greedy_search.
    """
    n_initial = len(parse_errors.get_errors(test_tree, gold_tree))
    log = out_dict['out']
    print >> log, "{} Initial errors".format(n_initial)

    stats, steps = greedy_search(gold_tree, test_tree, classify)
    print >> log, "{} on fringe, {} iterations".format(*stats)

    if steps is None:
        print >> log, "no path found"
    else:
        print >> out_dict['test_trees'], test_tree
        print >> out_dict['gold_trees'], gold_tree

        # One summary line per repair (the initial step is skipped).
        for step in steps[1:]:
            summary = "{} Error:{}".format(
                str(step[2]), step[1]['classified_type'])
            print >> log, summary

        # Full dump of every step, tallying the change per error type.
        if len(steps) > 1:
            for step in steps:
                info = step[1]
                print >> log, "Step:{}".format(info['classified_type'])
                error_counts[info['classified_type']].append(step[2])
                print >> log, info
                rendered = render_tree.text_coloured_errors(
                    step[0], gold=gold_tree)
                print >> log, rendered.strip()

    print >> out_dict['err'], ""
    print >> log, ""
Ejemplo n.º 2
0
def greedy_search(gold, test, classify):
    """Greedily repair a clone of test until it has no errors against gold.

    Each round asks successors() for candidate trees, ignores candidates
    that have more errors than the current tree, and moves to the candidate
    that removes the most errors (the first one seen wins ties).

    Args:
        gold: the reference tree.
        test: the tree to repair; the search works on test.clone().
        classify: callable invoked as classify(info, gold, test) on the
            info dict of every step once a complete path has been found.

    Returns:
        A pair ((0, iterations), path).  path is a list of
        (tree, info, change) tuples starting with the initial
        (clone, {'type': 'init'}, 0) step, or None when the search gives
        up: after more than 100 iterations, or when no candidate is
        acceptable.

    Raises:
        Exception: if a candidate tree fails check_consistency().
    """
    # Initialise with the test tree
    cur = (test.clone(), {'type': 'init'}, 0)

    iters = 0
    path = []
    while True:
        path.append(cur)
        # Zero-gain moves are accepted below, so cap the number of rounds.
        if iters > 100:
            return (0, iters), None

        # Check for victory
        ctree = cur[0]
        cerrors = parse_errors.ParseErrorSet(gold, ctree)
        if len(cerrors) == 0:
            break

        best = None
        for _fixes, ntree, info in successors(ctree, cerrors, gold):
            if not ntree.check_consistency():
                raise Exception("Inconsistent tree! {}".format(ntree))
            nerrors = parse_errors.get_errors(ntree, gold)
            change = len(cerrors) - len(nerrors)
            # Never move to a tree that is worse than the current one.
            if change < 0:
                continue
            if best is None or change > best[2]:
                best = (ntree, info, change)

        if best is None:
            # Dead end: errors remain but nothing acceptable was proposed.
            # Without this guard cur would become None and the next round
            # would fail on cur[0]; report "no path" like the cap above.
            return (0, iters), None

        cur = best
        iters += 1

    for step in path:
        classify(step[1], gold, test)

    return (0, iters), path
def greedy_search(gold, test, classify):
	"""Greedily repair a clone of test until it has no errors against gold.

	Each round asks successors() for candidate trees, ignores candidates
	that have more errors than the current tree, and moves to the candidate
	that removes the most errors (the first one seen wins ties).

	Args:
		gold: the reference tree.
		test: the tree to repair; the search works on test.clone().
		classify: callable invoked as classify(info, gold, test) on the
			info dict of every step once a complete path has been found.

	Returns:
		A pair ((0, iterations), path).  path is a list of
		(tree, info, change) tuples starting with the initial
		(clone, {'type': 'init'}, 0) step, or None when the search gives
		up: after more than 100 iterations, or when no candidate is
		acceptable.

	Raises:
		Exception: if a candidate tree fails check_consistency().
	"""
	# Initialise with the test tree
	cur = (test.clone(), {'type': 'init'}, 0)

	iters = 0
	path = []
	while True:
		path.append(cur)
		# Zero-gain moves are accepted below, so cap the number of rounds.
		if iters > 100:
			return (0, iters), None

		# Check for victory
		ctree = cur[0]
		cerrors = parse_errors.Parse_Error_Set(gold, ctree)
		if len(cerrors) == 0:
			break

		best = None
		for _fixes, ntree, info in successors(ctree, cerrors, gold):
			if not ntree.check_consistency():
				raise Exception("Inconsistent tree! {}".format(ntree))
			nerrors = parse_errors.get_errors(ntree, gold)
			change = len(cerrors) - len(nerrors)
			# Never move to a tree that is worse than the current one.
			if change < 0:
				continue
			if best is None or change > best[2]:
				best = (ntree, info, change)

		if best is None:
			# Dead end: errors remain but nothing acceptable was proposed.
			# Without this guard cur would become None and the next round
			# would fail on cur[0]; report "no path" like the cap above.
			return (0, iters), None

		cur = best
		iters += 1

	for step in path:
		classify(step[1], gold, test)

	return (0, iters), path
def compare_trees(gold_tree, test_tree, out_dict, error_counts, classify):
	"""Report the errors in test_tree and the greedy path that repairs them.

	Writes the initial error count and the search statistics to
	out_dict['out'].  When greedy_search returns a path, the test and gold
	trees are written to out_dict['test_trees'] and out_dict['gold_trees'],
	every step after the first is summarised, and, if the path has more
	than one step, each step is printed in full while its change value is
	appended to error_counts[<classified type>].  Otherwise "no path found"
	is written.  A blank line is always written to out_dict['err'] and
	out_dict['out'] at the end.

	Args:
		gold_tree: the reference tree.
		test_tree: the tree being evaluated.
		out_dict: mapping of stream names ('out', 'err', 'test_trees',
			'gold_trees') to writable file-like objects.
		error_counts: mapping from classified error type to a list.
		classify: forwarded unchanged to greedy_search.
	"""
	n_initial = len(parse_errors.get_errors(test_tree, gold_tree))
	log = out_dict['out']
	print >> log, "{} Initial errors".format(n_initial)

	stats, steps = greedy_search(gold_tree, test_tree, classify)
	print >> log, "{} on fringe, {} iterations".format(*stats)

	if steps is None:
		print >> log, "no path found"
	else:
		print >> out_dict['test_trees'], test_tree
		print >> out_dict['gold_trees'], gold_tree

		# One summary line per repair (the initial step is skipped).
		for step in steps[1:]:
			summary = "{} Error:{}".format(
				str(step[2]), step[1]['classified_type'])
			print >> log, summary

		# Full dump of every step, tallying the change per error type.
		if len(steps) > 1:
			for step in steps:
				info = step[1]
				print >> log, "Step:{}".format(info['classified_type'])
				error_counts[info['classified_type']].append(step[2])
				print >> log, info
				rendered = render_tree.text_coloured_errors(
					step[0], gold=gold_tree)
				print >> log, rendered.strip()

	print >> out_dict['err'], ""
	print >> log, ""
def text_coloured_errors(tree,
		gold=None,
		depth=0,
		single_line=False,
		missing=None,
		extra=None,
		compressed=True,
		POS=True):
	"""Pretty print a tree as bracketed text, with errors marked using colour.

	The function recurses over tree.subtrees, passing the same error
	collections down at every level.

	Args:
		tree: node to render. The code reads its label, word, span, parent
			and subtrees attributes.
		gold: reference tree, used only when 'missing' or 'extra' is None,
			to compute the errors with parse_errors.get_errors.
		depth: number of tab characters used to indent this node when
			single_line is False.
		single_line: when False every node starts on a new, indented line;
			when True subtree texts are joined with single spaces instead.
		missing: None, or a list of tuples
			(start, end, label, crossing-T/F)
			describing brackets that should be present but are not.
		extra: None, or a collection of nodes (tested with 'in' and read
			through their .span) that should not be present.
		compressed: when True, a non-leaf node whose span contains no
			error is rendered as just its words (via text_words).
		POS: passed as the third argument to parse_errors.get_errors when
			the errors are computed here, then rebound to the list of
			'diff POS' errors. On recursive calls it is that list.

	Returns:
		The rendered text. If 'missing' or 'extra' is None and no gold
		tree is given, an error message string is returned instead.
	"""
	# TODO: Add the ability to compress the same parts consistently (even after
	# errors are no longer present). This would need to be span based as
	# structure could change.
	ans = ''
	if missing is None or extra is None:
		if gold is None:
			return "Error - no gold tree and no missing list for colour repr"
		# look at gold and work out what missing should be
		# Each error is indexed as: [0] kind, [1] (start, end) span, [2] label,
		# [3] node; 'diff POS' errors also carry [4], printed below in the
		# "missing" colour (presumably the gold tag - confirm in parse_errors).
		errors = parse_errors.get_errors(tree, gold, POS)
		extra = [e[3] for e in errors if e[0] == 'extra' and e[3].word is None]
		extra = set(extra)
		missing = [(e[1][0], e[1][1], e[2], False) for e in errors
				if e[0] == 'missing' and e[3].word is None]
		missing += [(e[1][0], e[1][1], e[2], True) for e in errors
				if e[0] == 'crossing' and e[3].word is None]
		POS = [e for e in errors if e[0] == 'diff POS']
	# NOTE(review): if a caller supplies both 'missing' and 'extra' but leaves
	# POS at its default of True, the 'for error in POS' loops below would
	# iterate over a bool. Presumably such callers pass a list or None - confirm.

	# Terminal escape sequences (standard SGR colour numbers: 36 cyan,
	# 31 red, 33 yellow; 00 resets).
	start_missing = "\033[01;36m"
	start_extra = "\033[01;31m"
	start_crossing = "\033[01;33m"
	end_colour = "\033[00m"

	if not single_line:
		ans += '\n' + depth * '\t'

	# start of this
	if tree in extra:
		# The whole opening bracket of an extra node is coloured.
		ans += start_extra + '(' + tree.label + end_colour
	elif tree.word is not None and POS is not None:
		# Leaf: if it has a POS error, show error[4] in the "missing" colour
		# followed by the tree's own label in the "extra" colour.
		found = False
		for error in POS:
			if error[3] == tree:
				found = True
				ans += '(' + start_missing + error[4] + end_colour
				ans += ' ' + start_extra + tree.label + end_colour
				break
		if not found:
			ans += '(' + tree.label
	else:
		ans += '(' + tree.label

	# If we are compressing, check for correctness and then just print words
	sub_done = False
	if compressed and tree not in extra and tree.word is None:
		all_right = True
		# An extra node lying within this span blocks compression.
		for error in extra:
			if tree.span[0] <= error.span[0] and error.span[1] <= tree.span[1]:
				all_right = False
				break
		for error in missing:
			if error[3]:
				# Crossing bracket: blocks compression when either of its
				# end points falls strictly inside this span.
				if tree.span[0] < error[0] < tree.span[1]:
					all_right = False
					break
				if tree.span[0] < error[1] < tree.span[1]:
					all_right = False
					break
			elif tree.span[0] <= error[0] and error[1] <= tree.span[1]:
				# Plain missing bracket contained in this span.
				all_right = False
				break
		if POS is not None:
			for error in POS:
				if tree.span[0] <= error[1][0] and error[1][1] <= tree.span[1]:
					all_right = False
					break
		if all_right:
			# Nothing wrong below this node: emit its words and close it.
			ans += ' ' + text_words(tree) + ')'
			sub_done = True

	# crossing brackets starting
	# Only done for the root or for a node that is not its parent's first
	# child (presumably because a first child starts where its parent does,
	# so the bracket is reported higher up - confirm).
	if tree.parent is None or tree.parent.subtrees[0] != tree:
		# these are marked as high as possible
		labels = []
		for error in missing:
			if error[0] == tree.span[0] and error[3]:
				labels.append((error[1], error[2]))
		# Brackets that end furthest away are opened first.
		labels.sort(reverse=True)
		if len(labels) > 0:
			to_add = start_crossing + ' '.join(
					['(' + label[1] for label in labels]) + end_colour
			if sub_done:
				# The node is already closed, so the crossing openers go in
				# front of it. 'nans' collects every newline/tab of 'ans' and
				# its length is used as the offset where the rest of 'ans'
				# resumes; that equals the leading indentation provided the
				# remaining text has no newlines or tabs
				# (NOTE(review): confirm text_words never emits them).
				nans = ''
				for char in ans:
					if char in '\t\n':
						nans += char
				clen = len(nans)
				nans += to_add
				nans += ' ' + ans[clen:]
				ans = nans
			else:
				ans += ' ' + to_add

	if not sub_done:
		# word
		if tree.word is not None:
			ans += ' ' + tree.word

		# subtrees
		# Each entry is [span start, span end, rendered text] for one child.
		below = []
		for subtree in tree.subtrees:
			text = text_coloured_errors(subtree, gold, depth + 1, single_line,
					missing, extra, compressed, POS)
			if single_line:
				text = ' ' + text
			below.append([subtree.span[0], subtree.span[1], text])
		# add missing brackets that surround subtrees
		# Every run of consecutive children i..j is tried, shortest runs
		# first; the run covering all children is skipped here.
		for length in range(1, len(below)):
			for i in range(len(below)):
				j = i + length
				if i == 0 and j == len(below) - 1:
					continue
				if j >= len(below):
					break
				for error in missing:
					if below[i][0] == error[0] and below[j][1] == error[
							1] and not error[3]:
						# Leading newline/tab prefix of the first child.
						start = ''
						for char in below[i][2]:
							if char not in '\n\t':
								break
							start += char
						# Push the wrapped children one tab deeper.
						for k in range(i, j + 1):
							below[k][2] = '\n\t'.join(below[k][2].split('\n'))
						below[i][2] = start + start_missing + '(' + error[
								2] + end_colour + below[i][2]
						below[j][2] += start_missing + ')' + end_colour
		ans += ''.join([part[2] for part in below])

		# end of this
		if tree in extra:
			ans += start_extra + ')' + end_colour
		else:
			ans += ')'

	# Mirror of the "starting" case: only the root or a node that is not
	# its parent's last child reports crossing brackets ending here.
	if tree.parent is None or tree.parent.subtrees[-1] != tree:
		# if there are crossing brackets that end here, mark that
		labels = []
		for error in missing:
			if error[1] == tree.span[1] and error[3]:
				labels.append((-error[0], error[2]))
		# Sorting on the negated start closes the latest-starting bracket first.
		labels.sort()
		if len(labels) > 0:
			ans += ' ' + start_crossing + ' '.join(
					[label[1] + ')' for label in labels]) + end_colour

	# TODO: Change so that at the top level,
	# FRAG etc isn't printed outside of ROOT
	# Actually, just have a canonical ordering for unaries
	# (so that NPs end up under FRAGs)
	# Only-children are skipped: the check runs for the root or for a node
	# that has at least one sibling.
	if tree.parent is None or len(tree.parent.subtrees) > 1:
		# check for missing brackets that go around this node
		for error in missing:
			if (error[0] == tree.span[0]
					and error[1] == tree.span[1] and not error[3]):
				if tree not in extra:
					# Put them on a new level
					extra_text = ''
					if not single_line:
						# Indent everything rendered so far by one more tab.
						ans = '\n\t'.join(ans.split('\n'))
						extra_text = '\n' + depth * '\t'
					extra_text += start_missing + '(' + error[2] + end_colour
					if single_line:
						ans = ' ' + ans
					ans = extra_text + ans
					ans += start_missing + ')' + end_colour
				else:
					# Put them on the same line
					# 'start' counts the leading newline/tab characters so
					# the missing opener lands right after the indentation.
					start = 0
					for char in ans:
						if char not in '\n\t':
							break
						start += 1
					pretext = ans[:start]
					ans = ans[start:]
					extra_text = start_missing + '(' + error[
							2] + end_colour + ' '
					ans = pretext + extra_text + ans
					ans += start_missing + ')' + end_colour
	return ans