def extra(error, test_tree, to_group, to_add, ungrouped):
    '''Classify one extra-bracket error and hand it to a specific handler.

    The bracket in test_tree that corresponds to error is looked up, and the
    missing-bracket errors in ungrouped are examined to decide which handler
    applies:

    - missing brackets cross the extra bracket: dispatch on whether they
      start or end inside it (extra_crossing_ending,
      extra_crossing_starting, extra_multicrossing_starting)
    - nothing crosses: look for the smallest missing bracket that covers the
      extra one and dispatch on whether it matches
      (extra_matching_crossing_miss, extra_matching_miss, extra_no_matching)

    Arguments:
      error - the extra-bracket error (its .node.span is read)
      test_tree - the tree being repaired
      to_group - list handed on to the handlers
      to_add - not used by this function
      ungrouped - iterable of errors; .missing and .node are read

    Returns whatever the chosen handler returns, or (None, test_tree) when
    no handler applies or the extra bracket cannot be found in the tree.
    '''
    # Get the bracket in the tree that corresponds to this error
    ctree = bracket_errors.get_extra_tree(error, test_tree)
    if ctree is None:
        # Report on stdout and stderr, then give up: every path below
        # needs ctree, so carrying on cannot produce a grouping.
        print('Did not find the matching extra bracket')
        sys.stderr.write('Did not find the matching extra bracket\n')
        print(error)
        print(test_tree)
        return None, test_tree

    # Find all missing-bracket errors that cross this bracket
    crossing_errors = [
        merror for merror in ungrouped
        if merror.missing and bracket_errors.error_crosses_bracket(merror, ctree)
    ]

    if crossing_errors:
        # Sort them into those that start inside this bracket and those that
        # end inside it, keyed by the position where they start / end
        ending = {}
        starting = {}
        other = []
        cstart, cend = ctree.span[0], ctree.span[1]
        for merror in crossing_errors:
            mstart, mend = merror.node.span[0], merror.node.span[1]
            if cstart < mstart < cend < mend:
                starting.setdefault(mstart, []).append(merror)
            elif mstart < cstart < mend < cend:
                ending.setdefault(mend, []).append(merror)
            else:
                other.append(merror)

        if not starting and len(ending) == 1 and not other:
            return extra_crossing_ending(error, test_tree, to_group, ending, ungrouped, ctree)
        if len(starting) == 1 and not ending and not other:
            return extra_crossing_starting(error, test_tree, to_group, starting, ungrouped, ctree)
        if len(starting) > 1 and not ending:
            return extra_multicrossing_starting(error, test_tree, to_group, starting, ungrouped, ctree)

        # There could be a mixture of starting and ending, or multiple
        # starting points and multiple ending points: not handled here
        return None, test_tree

    # No crossing errors:
    # find the smallest missing error that covers this extra error
    shortest_error = None
    snode = None
    for merror in ungrouped:
        if not merror.missing:
            continue
        mnode = merror.node
        if not (mnode.span[0] <= error.node.span[0] and error.node.span[1] <= mnode.span[1]):
            continue
        if snode is None or (snode.span[0] <= mnode.span[0] and mnode.span[1] <= snode.span[1]):
            shortest_error = merror
            snode = mnode

    if shortest_error is None:
        return extra_no_matching(error, test_tree, ctree, to_group)

    # Check that there are no spans that are over the extra and under the
    # missing: the parent of the extra bracket counts as such a span when it
    # lies inside the missing span without being equal to it, and is not
    # itself marked extra.
    # NOTE(review): assumes ctree has a parent (is not the root) - confirm
    parent = ctree.parent
    sstart, send = shortest_error.node.span[0], shortest_error.node.span[1]
    pstart, pend = parent.span[0], parent.span[1]
    strictly_inside = ((sstart < pstart and pend <= send) or
                       (sstart <= pstart and pend < send))
    intermediate_spans = strictly_inside and not parent.extra

    if not intermediate_spans and shortest_error.node.label == ctree.label:
        # we have a matching missing error
        if bracket_errors.error_crosses_bracket(shortest_error, test_tree):
            return extra_matching_crossing_miss(error, test_tree, shortest_error, ungrouped, to_group)
        return extra_matching_miss(error, test_tree, shortest_error, ctree, to_group)
    return extra_no_matching(error, test_tree, ctree, to_group)
def extra_multicrossing_starting(error, test_tree, to_group, starting, ungrouped, ctree):
    '''Extra, then if there are crossing brackets that start here, and no
    crossing bracket that ends at the same spot, the other thing under this
    bracket has something that should have attached to it, but attached too
    high. Consider what would happen if it had attached here and see what
    other errors it fixes (ie this extra may now match with a missing bracket
    above).

    Arguments:
      error - the extra-bracket error being handled (not read here)
      test_tree - the tree being repaired; modified in place
      to_group - list that receives the errors resolved here
      starting - dict mapping a start position to the missing-bracket errors
        that cross ctree and start there
      ungrouped - iterable of errors; .missing, .extra and .node are read
      ctree - the bracket in test_tree for the extra error

    Returns (group_fields, test_tree) describing the attachment move, or
    (None, test_tree) when no extra/missing pairing is found or the nodes to
    move cannot be located. In both of those cases the tree is untouched.
    '''
    # find the longest crossing missing bracket that starts here
    # (only the first start position in iteration order is considered; this
    # selects the same key as starting.keys()[0] and works on Python 2 and 3)
    start = next(iter(starting))
    cend = ctree.span[1]
    longest_error = None
    for merror in starting[start]:
        if longest_error is None or longest_error.node.span[1] < merror.node.span[1]:
            longest_error = merror
    mspan = (cend, longest_error.node.span[1])

    # find the set of missing brackets that end where that one ends
    related_missing = [
        merror for merror in ungrouped
        if merror.missing and merror.node.span[1] == longest_error.node.span[1]
    ]

    # find the set of extra brackets that end where this one ends
    related_extra = []
    for eerror in ungrouped:
        if not eerror.extra:
            continue
        current_node = bracket_errors.get_extra_tree(eerror, test_tree)
        # skip errors whose bracket cannot be found in the tree
        if current_node is not None and current_node.span[1] == ctree.span[1]:
            related_extra.append((current_node.span, eerror))
    # Order by span only, so the error objects themselves are never compared
    related_extra.sort(key=lambda pair: pair[0])

    # find the lowest pairing: spans sort widest first, so the last extra
    # bracket with a matching missing bracket is the one that is kept
    lowest = None
    for _, eerror in related_extra:
        for merror in related_missing:
            if (merror.node.label == eerror.node.label and
                    merror.node.span[0] == eerror.node.span[0]):
                lowest = eerror
                break
    if lowest is None:
        return None, test_tree

    # find all the parts that start in the missing bracket to be here
    moving = []
    while cend < mspan[1]:
        brac = test_tree
        done = False
        while not done:
            for subtree in brac.subtrees:
                if cend == subtree.span[0]:
                    moving.append(subtree)
                    done = True
                    cend = subtree.span[1]
                    break
                if subtree.span[0] < cend < subtree.span[1]:
                    brac = subtree
                    break
            else:
                # No child starts at or strictly contains cend, so the
                # search cannot progress; give up rather than loop forever.
                # Nothing has been modified yet.
                return None, test_tree

    # move them across
    target = bracket_errors.get_extra_tree(lowest, test_tree)
    origin_label = moving[0].parent.label
    group_fields = {}
    group_fields['type'] = 'attachment'
    group_fields['height'] = 'too high'
    group_fields['from parent'] = origin_label
    group_fields['to parent'] = target.label
    group_desc = 'attachment too_high %s_instead_of_%s' % (origin_label, target.label)

    addendum = []
    single_child_parents = []
    for node in moving:
        parent = node.parent
        parent.subtrees.remove(node)
        # if the parent now has only one child, look into whether it should
        # be deleted
        if len(parent.subtrees) == 1 and parent.label == parent.subtrees[0].label:
            single_child_parents.append(parent)
        target.subtrees.append(node)
        node.parent = target
        addendum.append(node.label)
    group_fields['nodes moving'] = ' '.join(addendum)
    group_desc += ' ' + '_'.join(addendum)
    test_tree.update_span()

    for parent in single_child_parents:
        if len(parent.subtrees) != 1:
            continue
        child = parent.subtrees[0]
        if child.extra and parent.label == child.label:
            eerror = bracket_errors.get_extra_error(ungrouped, child)
            # same guard as extra_matching_crossing_miss: there may be no
            # recorded error for this bracket
            if eerror is not None:
                repair_tree.repair_extra_node(eerror, test_tree)
                to_group.append(eerror)

    # attempt to repair the longest crossing error
    if target == ctree:
        if repair_tree.repair_missing_node(longest_error, test_tree, failure_expected=True):
            to_group.append(longest_error)

    group_desc += ' |emcs1'
    group_fields['ID'] = 'emcs1'
    group_fields['old desc'] = group_desc
    return group_fields, test_tree
def extra_matching_crossing_miss(error, test_tree, shortest_error, ungrouped, to_group):
    '''Handle an extra bracket whose matching missing bracket ends at the
    same place but starts further left: the nodes between the two start
    points are moved under the extra bracket, which then stops being extra.

    Arguments:
      error - the extra-bracket error (its .node.span is read)
      test_tree - the tree being repaired; modified in place
      shortest_error - the missing-bracket error paired with error
      ungrouped - errors searched for side-effect extra brackets
      to_group - list that receives the errors resolved here

    Returns (group_fields, test_tree) describing the move, or
    (None, test_tree) when the two brackets do not end at the same position,
    when there is nothing to move, or when the nodes or the extra bracket
    cannot be located. In all of those cases the tree is left untouched.
    '''
    if shortest_error.node.span[1] != error.node.span[1]:
        return None, test_tree

    # Walk left from the start of the extra bracket to the start of the
    # missing bracket, collecting the nodes that fill the gap
    mspan = shortest_error.node.span
    cend = error.node.span[0]
    moving = []
    while cend > mspan[0]:
        brac = test_tree
        done = False
        while not done:
            for subtree in brac.subtrees:
                if cend == subtree.span[1] and subtree.span[0] >= mspan[0]:
                    moving.append(subtree)
                    done = True
                    cend = subtree.span[0]
                    break
                if subtree.span[0] < cend <= subtree.span[1]:
                    brac = subtree
                    break
            else:
                # No child ends at or contains cend, so the search cannot
                # progress; give up rather than loop forever
                return None, test_tree
    if not moving:
        # The two brackets cover the same span: there is nothing to move,
        # and the description below needs at least one moving node
        return None, test_tree

    target = bracket_errors.get_extra_tree(error, test_tree)
    if target is None:
        return None, test_tree
    target.extra = False

    # move them across
    group_fields = {}
    group_fields['type'] = 'attachment'
    group_fields['height'] = 'incorrect'
    # recorded before the move, while each node still has its old parent
    group_fields['from parents'] = ''.join(' ' + node.parent.label for node in moving)
    group_fields['to parent'] = target.label
    group_desc = 'attachment incorrect %s_instead_of_%s' % (moving[0].parent.label, target.label)

    addendum = []
    single_child_parents = []
    for node in moving:
        parent = node.parent
        parent.subtrees.remove(node)
        # if the parent now has only one child, look into whether it should
        # be deleted
        if len(parent.subtrees) == 1 and parent.label == parent.subtrees[0].label:
            single_child_parents.append(parent)
        # moving runs right to left, so inserting at the front keeps order
        target.subtrees.insert(0, node)
        node.parent = target
        addendum.append(node.label)
    group_fields['nodes moving'] = ' '.join(addendum)
    group_desc += ' ' + '_'.join(addendum)
    test_tree.update_span()

    for parent in single_child_parents:
        if len(parent.subtrees) != 1:
            continue
        child = parent.subtrees[0]
        if child.extra and parent.label == child.label:
            eerror = bracket_errors.get_extra_error(ungrouped, child)
            if eerror is not None:
                repair_tree.repair_extra_node(eerror, test_tree)
                to_group.append(eerror)

    to_group.append(error)
    to_group.append(shortest_error)
    group_desc += ' |emcm1'
    group_fields['ID'] = 'emcm1'
    group_fields['old desc'] = group_desc
    test_tree.check_consistency()
    return group_fields, test_tree
def extra_crossing_ending(error, test_tree, to_group, ending, ungrouped, ctree):
    '''Extra, then if there is a crossing bracket that ends in the middle of
    here, the other thing under this bracket is attaching too low. This could
    explain a bunch of other errors. In particular, consider if the wrongly
    attached thing was collapsed to 0, what would that fix (note that the
    extra bracket may still be extra at this point, or may now be equivalent
    to a missing bracket).

    Two cases are handled:
      ece2 - a single crossing error, and a missing bracket with the same
        label and end as the extra bracket that starts where the crossing
        error starts: the nodes in between are moved under the extra bracket
      ece1 - otherwise, the nodes of the extra bracket that lie after the
        crossing point are moved up to an ancestor

    Arguments:
      error - the extra-bracket error being handled
      test_tree - the tree being repaired; modified in place
      to_group - list that receives the errors resolved here
      ending - dict mapping an end position to the missing-bracket errors
        that cross ctree and end there
      ungrouped - iterable of errors; .missing and .node are read
      ctree - the bracket in test_tree for the extra error

    Returns (group_fields, test_tree), or (None, test_tree) with the tree
    untouched when the nodes to move cannot be located.
    '''
    # work out what needs to move
    # (only the first end position in iteration order is considered; this
    # selects the same key as ending.keys()[0] and works on Python 2 and 3)
    end = next(iter(ending))
    crossing_errors = ending[end]

    # Check the case of a matching missing bracket
    if len(crossing_errors) == 1:
        crossing_start = crossing_errors[0].node.span[0]
        for merror in ungrouped:
            if not (merror.missing and merror.node.label == error.node.label):
                continue
            if merror.node.span[1] != error.node.span[1]:
                continue
            if merror.node.span[0] != crossing_start:
                continue

            # the other things should be moving under here!
            target = bracket_errors.get_extra_tree(error, test_tree)
            mspan = merror.node.span
            cend = target.span[0]
            moving = []
            while cend > mspan[0]:
                brac = test_tree
                done = False
                while not done:
                    for subtree in brac.subtrees:
                        if cend == subtree.span[1] and subtree.span[0] >= mspan[0]:
                            moving.append(subtree)
                            done = True
                            cend = subtree.span[0]
                            break
                        if subtree.span[0] < cend <= subtree.span[1]:
                            brac = subtree
                            break
                    else:
                        # the search cannot progress; give up rather than
                        # loop forever (nothing has been modified yet)
                        return None, test_tree

            # move them across
            group_fields = {}
            group_fields['type'] = 'attachment'
            group_fields['height'] = 'incorrect'
            # recorded before the move, while nodes still have old parents
            group_fields['from parents'] = ''.join(
                ' ' + node.parent.label for node in moving)
            group_fields['to parent'] = target.label
            group_desc = 'attachment incorrect %s_instead_of_%s' % (
                moving[0].parent.label, target.label)

            addendum = []
            single_child_parents = []
            for node in moving:
                parent = node.parent
                parent.subtrees.remove(node)
                # if the parent now has only one child, look into whether it
                # should be deleted
                if len(parent.subtrees) == 1 and parent.label == parent.subtrees[0].label:
                    single_child_parents.append(parent)
                # moving runs right to left; inserting at the front of both
                # lists restores left-to-right order
                target.subtrees.insert(0, node)
                node.parent = target
                addendum.insert(0, node.label)
            group_fields['nodes moving'] = ' '.join(addendum)
            group_desc += ' ' + '_'.join(addendum)
            test_tree.update_span()

            for parent in single_child_parents:
                if len(parent.subtrees) != 1:
                    continue
                child = parent.subtrees[0]
                if child.extra and parent.label == child.label:
                    eerror = bracket_errors.get_extra_error(ungrouped, child)
                    # same guard as extra_matching_crossing_miss: there may
                    # be no recorded error for this bracket
                    if eerror is not None:
                        repair_tree.repair_extra_node(eerror, test_tree)
                        to_group.append(eerror)

            target.extra = False
            if error not in to_group:
                to_group.append(error)
            to_group.append(merror)
            group_desc += ' |ece2'
            group_fields['ID'] = 'ece2'
            group_fields['old desc'] = group_desc
            test_tree.check_consistency()
            return group_fields, test_tree

    # work out where it is going to move to
    # first find the longest crossing error (the one starting furthest left)
    longest_error = None
    for merror in crossing_errors:
        if longest_error is None or merror.node.span[0] < longest_error.node.span[0]:
            longest_error = merror
    end = longest_error.node.span[1]

    # collect the nodes between the crossing point and the end of ctree
    cend = end
    moving = []
    while cend < ctree.span[1]:
        brac = test_tree
        done = False
        while not done:
            for subtree in brac.subtrees:
                if cend == subtree.span[0] and subtree.span[0] <= ctree.span[1]:
                    moving.append(subtree)
                    done = True
                    cend = subtree.span[1]
                    break
                if subtree.span[0] <= cend < subtree.span[1]:
                    brac = subtree
                    break
            else:
                # the search cannot progress; give up rather than loop
                # forever (nothing has been modified yet)
                return None, test_tree

    # then see how far up we can go to it: climb while the ancestor still
    # ends where ctree ends, stopping once it covers the crossing error start
    parent = ctree
    while parent.span[1] == ctree.span[1]:
        if parent.span[0] <= longest_error.node.span[0]:
            break
        parent = parent.parent

    # move the things up to this level
    group_fields = {}
    group_fields['type'] = 'attachment'
    group_fields['height'] = 'too low'
    group_fields['from parent'] = ctree.label
    group_fields['to parent'] = parent.label
    group_desc = 'attachment too_low %s_instead_of_%s' % (ctree.label, parent.label)
    moved_labels = []
    for pos in range(len(parent.subtrees)):
        if parent.subtrees[pos].span[1] == ctree.span[1]:
            # insert the movers, in order, right after this child
            insert_at = pos + 1
            for subtree in moving:
                subtree.parent.subtrees.remove(subtree)
                parent.subtrees.insert(insert_at, subtree)
                insert_at += 1
                subtree.parent = parent
                group_desc += ' ' + subtree.label
                moved_labels.append(subtree.label)
            break
    group_fields['nodes moving'] = ' '.join(moved_labels)

    # if only one thing is left behind, and its parent is extra, fix that:
    # splice the remaining child into ctree's place and drop ctree
    if len(ctree.subtrees) == 1:
        siblings = ctree.parent.subtrees
        for pos in range(len(siblings)):
            if siblings[pos] == ctree:
                for subtree in ctree.subtrees[::-1]:
                    siblings.insert(pos + 1, subtree)
                    subtree.parent = ctree.parent
                break
        siblings.remove(ctree)
        to_group.append(error)
    test_tree.update_span()

    # if possible, fix longest_error: it can be added when its start and end
    # line up with the boundaries of two different children of parent
    left, right = -1, -1
    for pos, child in enumerate(parent.subtrees):
        if longest_error.node.span[0] == child.span[0]:
            left = pos
        if longest_error.node.span[1] == child.span[1]:
            right = pos
    if -1 < left < right:
        repair_tree.repair_missing_node(longest_error, test_tree)
        to_group.append(longest_error)

    # other errors that are fixed as a side effect will be found by the
    # cleanup stuff
    group_desc += ' |ece1'
    group_fields['ID'] = 'ece1'
    group_fields['old desc'] = group_desc
    return group_fields, test_tree