Esempio n. 1
0
    def test_edit_distance_compliment(self):
        '''
        Compare a boundary string holding no boundaries with one that has a
        type-1 boundary at each of its ten positions.

        Ten ``(1, 'b')`` additions are expected, with no substitutions and no
        transpositions.
        '''
        size = 10
        # Build a fresh set per position so no two positions share an object
        a = [set() for _ in range(size)]
        b = [{1} for _ in range(size)]
        expected = ([(1, 'b')] * size, [], [])

        additions, substitutions, transpositions = boundary_edit_distance(a, b)
        self.assertEqual(expected,
                         (additions, substitutions, transpositions))
Esempio n. 2
0
    def test_edit_distance(self):
        '''
        Check a pair of boundary strings whose expected edits combine two
        additions, one substitution and one transposition.
        '''
        a = [set(), {1}, set(), set(), {1},
             set(), set(), set(), set(), set()]
        b = [set(), {2, 3}, set(), set(), set(),
             {1}, set(), set(), {3}, set()]
        expected = ([(3, 'b'), (3, 'b')], [(1, 2)], [(4, 5, 1)])

        additions, substitutions, transpositions = boundary_edit_distance(a, b)
        self.assertEqual(expected,
                         (additions, substitutions, transpositions))
Esempio n. 3
0
 def test_edit_distance_two_substitutions_one_ad_into_no_transpositions(
         self):
     '''
     With ``n_t=3``, boundaries at positions 3 and 5 are expected to yield
     two substitutions and one addition, and no transpositions.
     '''
     a = [set(), set(), set(), {2}, set(),
          {3}, set(), set(), set(), set()]
     b = [set(), set(), set(), {1, 4}, set(),
          {2}, set(), set(), set(), set()]
     expected = ([(4, 'b')], [(2, 1), (3, 2)], [])

     additions, substitutions, transpositions = boundary_edit_distance(
         a, b, n_t=3)
     self.assertEqual(expected,
                      (additions, substitutions, transpositions))
Esempio n. 4
0
 def test_edit_distance_three_transpositions_overlapping(self):
     '''
     With ``n_t=3``, boundary types 1, 2 and 3 at position 3 of ``a`` face
     type 3 at position 4 and types 1 and 2 at position 5 of ``b``.

     Three transpositions are expected, with no additions and no
     substitutions.
     '''
     a = [set(), set(), set(), {1, 2, 3}, set(),
          set(), set(), set(), set(), set()]
     b = [set(), set(), set(), set(), {3},
          {1, 2}, set(), set(), set(), set()]
     expected = ([], [], [(3, 4, 3), (3, 5, 1), (3, 5, 2)])

     additions, substitutions, transpositions = boundary_edit_distance(
         a, b, n_t=3)
     self.assertEqual(expected,
                      (additions, substitutions, transpositions))
Esempio n. 5
0
 def test_edit_distance_two_transpositions_equal(self):
     '''
     ``a`` has type-2 boundaries at positions 4 and 6 while ``b`` has a
     single type-2 boundary at position 5.

     One transposition ``(4, 5, 2)`` and one addition ``(2, 'a')`` are
     expected, with no substitutions.
     '''
     a = [set(), set(), set(), set(), {2},
          set(), {2}, set(), set(), set()]
     b = [set(), set(), set(), set(), set(),
          {2}, set(), set(), set(), set()]
     expected = ([(2, 'a')], [], [(4, 5, 2)])

     additions, substitutions, transpositions = boundary_edit_distance(a, b)
     self.assertEqual(expected,
                      (additions, substitutions, transpositions))
 def test_edit_distance_two_substitutions_into_no_transpositions(self):
     '''
     With ``n_t=3``, differing boundary types at positions 3 and 5 are
     expected to yield two substitutions, and no additions or
     transpositions.
     '''
     # Start from ten empty positions, then place the boundaries
     a = [set() for _ in range(10)]
     b = [set() for _ in range(10)]
     a[3].add(2)
     a[5].add(3)
     b[3].add(1)
     b[5].add(2)
     expected = ([], [(2, 1), (3, 2)], [])

     additions, substitutions, transpositions = boundary_edit_distance(
         a, b, n_t=3)
     self.assertEqual(expected,
                      (additions, substitutions, transpositions))
 def test_edit_distance_two_transpositions_equal(self):
     '''
     ``a`` has type-2 boundaries at positions 4 and 6 while ``b`` has a
     single type-2 boundary at position 5.

     One transposition ``(4, 5, 2)`` and one addition ``(2, 'a')`` are
     expected, with no substitutions.
     '''
     # Start from ten empty positions, then place the boundaries
     a = [set() for _ in range(10)]
     b = [set() for _ in range(10)]
     a[4].add(2)
     a[6].add(2)
     b[5].add(2)
     expected = ([(2, 'a')], [], [(4, 5, 2)])

     additions, substitutions, transpositions = boundary_edit_distance(a, b)
     self.assertEqual(expected,
                      (additions, substitutions, transpositions))
 def test_edit_distance_three_transpositions_overlapping(self):
     '''
     With ``n_t=3``, boundary types 1, 2 and 3 at position 3 of ``a`` face
     type 3 at position 4 and types 1 and 2 at position 5 of ``b``.

     Three transpositions are expected, with no additions and no
     substitutions.
     '''
     # Start from ten empty positions, then place the boundaries
     a = [set() for _ in range(10)]
     b = [set() for _ in range(10)]
     a[3].update([1, 2, 3])
     b[4].add(3)
     b[5].update([1, 2])
     expected = ([], [], [(3, 4, 3), (3, 5, 1), (3, 5, 2)])

     additions, substitutions, transpositions = boundary_edit_distance(
         a, b, n_t=3)
     self.assertEqual(expected,
                      (additions, substitutions, transpositions))
Esempio n. 9
0
def __boundary_statistics__(
        segs_a, segs_b, boundary_types, boundary_format, n_t, weight):
    '''
    Collect edit counts and boundary tallies for a pair of segmentations.

    Both segmentations are first brought into the ``BoundaryFormat.sets``
    representation according to ``boundary_format`` and must then be of
    equal length.  ``weight`` is unpacked into three callables that score
    the additions, substitutions and transpositions returned by
    ``boundary_edit_distance``; the sum of the three scores is reported as
    ``count_edits``.

    The ``boundary_types`` argument is never read: the value reported is
    recomputed from the two segmentations using ``identify_types``.

    Raises ``SegmentationMetricError`` when the boundary format is not
    supported or when the converted segmentations differ in length.

    Returns a dict with the keys ``count_edits``, ``additions``,
    ``substitutions``, ``transpositions``, ``full_misses``,
    ``boundaries_all``, ``matches``, ``pbs`` and ``boundary_types``.
    '''

    # NLTK and position inputs are reduced to masses first
    if boundary_format == BoundaryFormat.nltk:
        segs_a = convert_nltk_to_masses(segs_a)
        segs_b = convert_nltk_to_masses(segs_b)
        boundary_format = BoundaryFormat.mass
    elif boundary_format == BoundaryFormat.position:
        segs_a = convert_positions_to_masses(segs_a)
        segs_b = convert_positions_to_masses(segs_b)
        boundary_format = BoundaryFormat.mass
    # Masses become boundary strings; sets are already in the right form
    if boundary_format == BoundaryFormat.mass:
        segs_a = boundary_string_from_masses(segs_a)
        segs_b = boundary_string_from_masses(segs_b)
    elif boundary_format != BoundaryFormat.sets:
        raise SegmentationMetricError('Unsupported boundary format')
    # Both segmentations must cover the same number of positions
    length_a, length_b = len(segs_a), len(segs_b)
    if length_a != length_b:
        raise SegmentationMetricError(
            'Segmentations differ in length ({0} != {1})'.format(
                length_a, length_b))
    # The boundary types actually present replace the argument passed in
    types_found = identify_types(segs_a, segs_b)
    pbs = len(segs_b) * len(types_found)
    # Edit operations between the two segmentations
    edits = boundary_edit_distance(segs_a, segs_b, n_t=n_t)
    additions, substitutions, transpositions = edits
    # Score each kind of edit with its own weighting function
    score_additions, score_substitutions, score_transpositions = weight
    count_edits = (
        score_additions(additions) +
        score_substitutions(
            substitutions, max(types_found), min(types_found)) +
        score_transpositions(transpositions, n_t))
    # Position-by-position tallies
    matches = []
    full_misses = []
    boundaries_all = 0
    for bounds_a, bounds_b in zip(segs_a, segs_b):
        matches += bounds_a.intersection(bounds_b)
        full_misses += bounds_a.symmetric_difference(bounds_b)
        boundaries_all += len(bounds_a) + len(bounds_b)
    return {
        'count_edits': count_edits,
        'additions': additions,
        'substitutions': substitutions,
        'transpositions': transpositions,
        'full_misses': full_misses,
        'boundaries_all': boundaries_all,
        'matches': matches,
        'pbs': pbs,
        'boundary_types': types_found,
    }
    def test_edit_distance(self):
        '''
        Check a pair of boundary strings whose expected edits combine two
        additions, one substitution and one transposition.
        '''
        # Start from ten empty positions, then place the boundaries
        a = [set() for _ in range(10)]
        b = [set() for _ in range(10)]
        a[1].add(1)
        a[4].add(1)
        b[1].update([2, 3])
        b[5].add(1)
        b[8].add(3)
        expected = ([(3, 'b'), (3, 'b')], [(1, 2)], [(4, 5, 1)])

        additions, substitutions, transpositions = boundary_edit_distance(a, b)
        self.assertEqual(expected,
                         (additions, substitutions, transpositions))
    def test_edit_distance_compliment(self):
        '''
        Compare a boundary string holding no boundaries with one that has a
        type-1 boundary at each of its ten positions.

        Ten ``(1, 'b')`` additions are expected, with no substitutions and no
        transpositions.
        '''
        positions = range(10)
        # Build a fresh set per position so no two positions share an object
        a = [set() for _ in positions]
        b = [{1} for _ in positions]
        expected = ([(1, 'b') for _ in positions], [], [])

        additions, substitutions, transpositions = boundary_edit_distance(a, b)
        self.assertEqual(expected,
                         (additions, substitutions, transpositions))
    def test_edit_distance_identity(self):
        '''
        Compare a boundary string with itself; no additions, substitutions
        or transpositions are expected.
        '''
        a = [set() for _ in range(10)]
        a[1].add(1)
        a[4].add(1)
        # Both arguments are deliberately the very same list object
        b = a
        expected = ([], [], [])

        additions, substitutions, transpositions = boundary_edit_distance(a, b)
        self.assertEqual(expected,
                         (additions, substitutions, transpositions))
Esempio n. 13
0
    def test_edit_distance_identity(self):
        '''
        Compare a boundary string with itself; no additions, substitutions
        or transpositions are expected.
        '''
        a = [set(), {1}, set(), set(), {1},
             set(), set(), set(), set(), set()]
        # Both arguments are deliberately the very same list object
        b = a
        expected = ([], [], [])

        additions, substitutions, transpositions = boundary_edit_distance(a, b)
        self.assertEqual(expected,
                         (additions, substitutions, transpositions))
Esempio n. 14
0
def __boundary_statistics__(segs_a, segs_b, boundary_types, boundary_format,
                            n_t, weight):
    '''
    Collect edit counts and boundary tallies for a pair of segmentations.

    The segmentations are converted to the ``BoundaryFormat.sets``
    representation as dictated by ``boundary_format`` and are then required
    to have the same length.  The three callables unpacked from ``weight``
    score the additions, substitutions and transpositions produced by
    ``boundary_edit_distance``, and their sum is returned as
    ``count_edits``.

    The incoming ``boundary_types`` value is not used; it is replaced by the
    result of ``identify_types`` applied to both segmentations.

    Raises ``SegmentationMetricError`` for an unsupported boundary format or
    for converted segmentations of unequal length.

    Returns a dict with the keys ``count_edits``, ``additions``,
    ``substitutions``, ``transpositions``, ``full_misses``,
    ``boundaries_all``, ``matches``, ``pbs`` and ``boundary_types``.
    '''

    # Reduce NLTK and position inputs to masses
    if boundary_format == BoundaryFormat.nltk:
        segs_a, segs_b = (convert_nltk_to_masses(segs_a),
                          convert_nltk_to_masses(segs_b))
        boundary_format = BoundaryFormat.mass
    elif boundary_format == BoundaryFormat.position:
        segs_a, segs_b = (convert_positions_to_masses(segs_a),
                          convert_positions_to_masses(segs_b))
        boundary_format = BoundaryFormat.mass
    # Turn masses into boundary strings; reject anything that is not sets
    if boundary_format == BoundaryFormat.mass:
        segs_a, segs_b = (boundary_string_from_masses(segs_a),
                          boundary_string_from_masses(segs_b))
    elif boundary_format != BoundaryFormat.sets:
        raise SegmentationMetricError('Unsupported boundary format')
    # Guard against segmentations of different length
    size_a = len(segs_a)
    size_b = len(segs_b)
    if size_a != size_b:
        raise SegmentationMetricError(
            'Segmentations differ in length ({0} != {1})'.format(
                size_a, size_b))
    # Boundary types are taken from the data, not from the argument
    present_types = identify_types(segs_a, segs_b)
    potential_boundaries = len(segs_b) * len(present_types)
    # Edit operations separating the two segmentations
    additions, substitutions, transpositions = boundary_edit_distance(
        segs_a, segs_b, n_t=n_t)
    # Weighted edit total, scored in the order a, s, t
    weigh_a, weigh_s, weigh_t = weight
    weighted_a = weigh_a(additions)
    weighted_s = weigh_s(
        substitutions, max(present_types), min(present_types))
    weighted_t = weigh_t(transpositions, n_t)
    total_edits = weighted_a + weighted_s + weighted_t
    # Tally shared and unshared boundaries position by position
    shared = []
    unshared = []
    boundary_total = 0
    for left, right in zip(segs_a, segs_b):
        shared += left.intersection(right)
        unshared += left.symmetric_difference(right)
        boundary_total += len(left) + len(right)
    statistics = {
        'count_edits': total_edits,
        'additions': additions,
        'substitutions': substitutions,
        'transpositions': transpositions,
        'full_misses': unshared,
        'boundaries_all': boundary_total,
        'matches': shared,
        'pbs': potential_boundaries,
        'boundary_types': present_types
    }
    return statistics