def test_boundary_edit_distance(self):
    """Test boundary_edit_distance on the masses_an1 / masses_an2 fixtures."""
    # Expected return value, compared verbatim against the function's result.
    # NOTE(review): presumably (additions, substitutions, transpositions) --
    # confirm against boundary_edit_distance's documentation.
    expected_edits = ([(1, 'b'), (1, 'b'), (1, 'b')], [], [(9, 10, 1)])
    string_an1 = boundary_string_from_masses(self.masses_an1)
    string_an2 = boundary_string_from_masses(self.masses_an2)
    # assertEqual rather than assertEquals: the latter is a deprecated alias
    # that was removed from unittest.TestCase in Python 3.12.
    self.assertEqual(
        expected_edits,
        boundary_edit_distance(string_an1, string_an2))
def boundary_string_from_boundary_indices(segmentation, doc_length):
    """Convert boundary indices to a segeval-compatible boundary string.

    The document is walked position by position; every time the next
    boundary index is reached, the number of tokens counted since the
    previous boundary is recorded as one segment mass. The final mass is
    the distance from the last boundary to the end of the document.

    Args:
        segmentation (list of int): segmentation boundary indices.
            Assumed to be strictly increasing and to lie strictly between
            0 and ``doc_length``; other inputs are not validated here.
        doc_length (int): length of the segmented document.

    Returns:
        tuple: boundary string as produced by
        ``segeval.boundary_string_from_masses``.
    """
    # No boundaries at all: the whole document is one segment. The original
    # code crashed here with IndexError on ``segmentation[-1]``.
    # ``len(...) == 0`` (not truthiness) so array-like inputs keep working.
    if len(segmentation) == 0:
        return segeval.boundary_string_from_masses((doc_length,))

    masses = []
    tokens_in_segment = 0
    next_boundary = 0  # index into ``segmentation`` of the next boundary
    boundary_count = len(segmentation)
    for position in range(1, doc_length):
        tokens_in_segment += 1
        if (next_boundary < boundary_count
                and position >= segmentation[next_boundary]):
            # Boundary reached: close the current segment.
            masses.append(tokens_in_segment)
            tokens_in_segment = 0
            next_boundary += 1

    # Tokens after the last boundary form the final segment.
    masses.append(doc_length - segmentation[-1])
    return segeval.boundary_string_from_masses(tuple(masses))
def test_boundary_string_from_masses(self):
    """Test boundary_string_from_masses on the masses_an1 fixture."""
    # Expected string has twelve positions; only index 10 is non-empty and
    # holds the single element 1.
    no_boundary = frozenset([])
    expected = (no_boundary,) * 10 + (frozenset([1]), no_boundary)
    # assertEqual rather than assertEquals: the latter is a deprecated alias
    # that was removed from unittest.TestCase in Python 3.12.
    self.assertEqual(
        expected,
        boundary_string_from_masses(self.masses_an1))