Beispiel #1
0
class TestSegmentAnnotationCmp(object):
    """Exercise ``annotation_cmp`` on single-token segment annotations.

    ``sa1``, ``sa2`` and ``sa3`` all carry the name 'n1' and differ only in
    the interval of their single token; ``sa4`` carries the name 'n2'.
    """

    sa1 = SegmentAnnotation(
        'n1', [FragmentToken('n1', Interval(0, 0.5), None)])
    sa2 = SegmentAnnotation(
        'n1', [FragmentToken('n1', Interval(0.5, 1.5), None)])
    sa3 = SegmentAnnotation(
        'n1', [FragmentToken('n1', Interval(1.3, 1.4), None)])
    sa4 = SegmentAnnotation(
        'n2', [FragmentToken('n2', Interval(0, 1), None)])

    def test_invalid_comparison(self):
        """Comparing annotations named 'n1' and 'n2' must raise ValueError."""
        with pytest.raises(ValueError):
            annotation_cmp(self.sa1, self.sa4)

    def test_annotation_eq(self):
        """Comparing an annotation with itself is expected to give 0."""
        for annotation in (self.sa1, self.sa2, self.sa3):
            assert annotation_cmp(annotation, annotation) == 0

    def test_annotation_cmp(self):
        """Check the expected -1 / 1 / 0 results between distinct annotations."""
        first, wide, inner = self.sa1, self.sa2, self.sa3

        # sa1 (0 to 0.5) ends no later than sa2 (0.5 to 1.5) and
        # sa3 (1.3 to 1.4) start
        assert annotation_cmp(first, wide) == -1
        assert annotation_cmp(first, inner) == -1
        assert annotation_cmp(wide, first) == 1
        assert annotation_cmp(inner, first) == 1

        # sa3 (1.3 to 1.4) lies inside sa2 (0.5 to 1.5); 0 is expected
        # in both argument orders
        assert annotation_cmp(wide, inner) == 0
        assert annotation_cmp(inner, wide) == 0
Beispiel #2
0
 def test_eq_wrong_ntokens(self):
     """Same name and overall span but different token counts: unequal."""
     one_token = SegmentAnnotation(
         'name1', [FragmentToken('', Interval(0, 2), None)])
     two_tokens = SegmentAnnotation(
         'name1',
         [FragmentToken('', Interval(0, 1), None),
          FragmentToken('', Interval(1, 2), None)])
     assert one_token != two_tokens
class TestCheckTruncateIntervals(object):
    """Exercise ``truncate_intervals`` against a one-segment corpus.

    Fixtures:

    * ``m1`` -- interval database holding (0.0, 1.0) and (2.0, 3.0) for 'a'.
    * ``d1`` .. ``d4`` -- class dictionaries with a single class and a
      single token each; ``d3`` is not referenced by the tests below.
    * ``ca`` -- corpus with one annotation for 'a' made of four tokens
      marked 'a', 'b', 'c', 'd', each a quarter of a unit long.
    """

    m1 = IntervalDB({'a': [(0.0, 1.0), (2.0, 3.0)]})

    d1 = ClassDict({
        ClassID(0, 'm1'): (FragmentToken('a', Interval(0.0, 1.0), 'm1'), ),
    })
    d2 = ClassDict({
        ClassID(0, 'm1'): (FragmentToken('a', Interval(0.5, 1.5), 'm1'), ),
    })
    d3 = ClassDict({
        ClassID(0, 'm1'): (FragmentToken('b', Interval(0.0, 1.0), 'm1'), ),
    })
    d4 = ClassDict({
        ClassID(0, 'm1'): (FragmentToken('a', Interval(0.5, 2.5), 'm1'), ),
    })

    sa = [
        SegmentAnnotation(
            'a',
            [
                FragmentToken('a', Interval(0.0, 0.25), 'a'),
                FragmentToken('a', Interval(0.25, 0.5), 'b'),
                FragmentToken('a', Interval(0.5, 0.75), 'c'),
                FragmentToken('a', Interval(0.75, 1.0), 'd'),
            ],
        ),
    ]
    ca = Corpus(sa)

    def test_good_interval(self):
        """A token matching a database interval is expected back unchanged."""
        outcome = truncate_intervals(self.d1, self.ca, self.m1)
        assert outcome == (self.d1, [], [])

    def test_truncate_interval(self):
        """Tokens running past 1.0 are expected to be cut to (0.5, 1.0)."""
        # d2 ends at 1.5 and d4 at 2.5; the same truncated token, marked
        # ('c', 'd'), is expected for both.
        for clsdict in (self.d2, self.d4):
            outcome = truncate_intervals(clsdict, self.ca, self.m1)
            expected = ClassDict({
                ClassID(0, 'm1'): (
                    FragmentToken('a', Interval(0.5, 1.0), ('c', 'd')),
                ),
            })
            assert outcome == (expected, [], [])
Beispiel #4
0
    def test_read_small(self):
        contents = """f1 0.000 0.100 a
f1 0.100 0.200 r
f1 0.200 0.300 m
f1 0.300 0.400 s
f1 0.400 0.500 a
f1 0.700 0.800 w
f1 0.800 0.900 o
f1 0.900 1.000 r
f1 1.000 1.100 m
f1 1.100 1.200 s
f1 1.200 1.300 a
f2 0.100 0.200 w
f2 0.200 0.300 o
f2 0.300 0.400 r
f2 0.400 0.500 d
f2 0.500 0.600 s
"""
        tokens = [
            FragmentToken('f1', Interval(0.0, 0.1), 'a'),
            FragmentToken('f1', Interval(0.1, 0.2), 'r'),
            FragmentToken('f1', Interval(0.2, 0.3), 'm'),
            FragmentToken('f1', Interval(0.3, 0.4), 's'),
            FragmentToken('f1', Interval(0.4, 0.5), 'a'),
            FragmentToken('f1', Interval(0.7, 0.8), 'w'),
            FragmentToken('f1', Interval(0.8, 0.9), 'o'),
            FragmentToken('f1', Interval(0.9, 1.0), 'r'),
            FragmentToken('f1', Interval(1.0, 1.1), 'm'),
            FragmentToken('f1', Interval(1.1, 1.2), 's'),
            FragmentToken('f1', Interval(1.2, 1.3), 'a'),
            FragmentToken('f2', Interval(0.1, 0.2), 'w'),
            FragmentToken('f2', Interval(0.2, 0.3), 'o'),
            FragmentToken('f2', Interval(0.3, 0.4), 'r'),
            FragmentToken('f2', Interval(0.4, 0.5), 'd'),
            FragmentToken('f2', Interval(0.5, 0.6), 's')
        ]
        corpus = Corpus([
            SegmentAnnotation('f1', tokens[0:5]),
            SegmentAnnotation('f1', tokens[5:11]),
            SegmentAnnotation('f2', tokens[11:])
        ])

        assert ([tokens[0:5], tokens[5:11],
                 tokens[11:]] == read_annotation(contents))
        assert (tokenlists_to_corpus(read_annotation(contents)) == corpus)
Beispiel #5
0
 def test_tokens_at_interval(self):
     """Check which tokens ``tokens_at_interval`` returns for several spans."""
     cases = [
         # the full span: every token
         (Interval(0.0, 0.5), tuple(self.tokenlist)),
         # an inner span: tokens 1 to 3
         (Interval(0.1, 0.4), tuple(self.tokenlist[1:4])),
         # a span inside the first token: that token alone
         (Interval(0.0, 0.05), (self.tokenlist[0], )),
         # a span far from the annotation: nothing
         (Interval(10, 11), tuple()),
     ]
     for interval, expected in cases:
         assert self.sa.tokens_at_interval(interval) == expected

     # an annotation without tokens has nothing to return
     empty = SegmentAnnotation('', [])
     assert empty.tokens_at_interval(Interval(0, 1)) == tuple()
Beispiel #6
0
 def test_restrict(self):
     """Restrict the corpus to three interval databases for name 'a'."""
     annotations = self.segment_annotations

     up_to_half = IntervalDB({'a': [(0.0, 0.5)]})
     up_to_end = IntervalDB({'a': [(0.0, 1.3)]})
     up_to_one = IntervalDB({'a': [(0.0, 1.0)]})

     # (0.0, 0.5): only the first annotation is expected to remain
     assert self.ca.restrict(up_to_half) == Corpus(annotations[:1])
     # (0.0, 1.3): the first two annotations are expected to remain
     assert self.ca.restrict(up_to_end) == Corpus(annotations[:2])

     # (0.0, 1.0): the second annotation is expected to keep only its
     # first three tokens
     restricted = self.ca.restrict(up_to_one)
     shortened = SegmentAnnotation('a', annotations[1][:3])
     assert restricted == Corpus([annotations[0], shortened])
Beispiel #7
0
def tokenlists_to_corpus(tokenlists):
    """Build a Corpus from a list of token lists.

    Each token list becomes one SegmentAnnotation whose name is read from
    the ``name`` attribute of the list's first token.

    Parameters
    ----------
    tokenlists : list of list of FragmentToken
        Every inner list is indexed at position 0 to obtain the annotation
        name, so it must contain at least one token.

    Returns
    -------
    c : Corpus

    """
    return Corpus([
        SegmentAnnotation(tokens[0].name, tokens) for tokens in tokenlists
    ])
Beispiel #8
0
class TestPairwiseSubstringCompletion(object):
    """Exercise ``pairwise_substring_completion`` on a three-file corpus.

    Fixtures:

    * ``fragments`` -- five quarter-unit tokens for each of the names
      'a', 'b' and 'c'; 'a' and 'b' are marked a..e, 'c' is marked f..j.
    * ``ca`` -- corpus with one annotation per name.
    * ``fragment1`` .. ``fragment4`` -- unmarked query fragments.
    * ``pfragments`` -- expected fragments with tuple marks; indices 0-4
      belong to 'a', 5-9 to 'b' and 10-14 to 'c'.
    """

    fragments = [
        FragmentToken('a', Interval(0.0, 0.25), 'a'),
        FragmentToken('a', Interval(0.25, 0.5), 'b'),
        FragmentToken('a', Interval(0.5, 0.75), 'c'),
        FragmentToken('a', Interval(0.75, 1.0), 'd'),
        FragmentToken('a', Interval(1.0, 1.25), 'e'),

        FragmentToken('b', Interval(0.0, 0.25), 'a'),
        FragmentToken('b', Interval(0.25, 0.5), 'b'),
        FragmentToken('b', Interval(0.5, 0.75), 'c'),
        FragmentToken('b', Interval(0.75, 1.0), 'd'),
        FragmentToken('b', Interval(1.0, 1.25), 'e'),

        FragmentToken('c', Interval(0.0, 0.25), 'f'),
        FragmentToken('c', Interval(0.25, 0.5), 'g'),
        FragmentToken('c', Interval(0.5, 0.75), 'h'),
        FragmentToken('c', Interval(0.75, 1.0), 'i'),
        FragmentToken('c', Interval(1.0, 1.25), 'j'),
    ]
    sa = [
        SegmentAnnotation('a', fragments[:5]),
        SegmentAnnotation('b', fragments[5:10]),
        SegmentAnnotation('c', fragments[10:]),
    ]
    ca = Corpus(sa)

    fragment1 = FragmentToken('a', Interval(0.0, 1.0), None)
    fragment2 = FragmentToken('b', Interval(0.0, 1.0), None)
    fragment3 = FragmentToken('c', Interval(0.0, 1.0), None)
    fragment4 = FragmentToken('b', Interval(0.0, 1.25), None)

    pfragments = [
        FragmentToken('a', Interval(0.0, 1.0), ('a', 'b', 'c', 'd')),
        FragmentToken('a', Interval(0.25, 1.25), ('b', 'c', 'd', 'e')),
        FragmentToken('a', Interval(0.0, 0.75), ('a', 'b', 'c')),
        FragmentToken('a', Interval(0.25, 1.0), ('b', 'c', 'd')),
        FragmentToken('a', Interval(0.5, 1.25), ('c', 'd', 'e')),

        FragmentToken('b', Interval(0.0, 1.0), ('a', 'b', 'c', 'd')),
        FragmentToken('b', Interval(0.25, 1.25), ('b', 'c', 'd', 'e')),
        FragmentToken('b', Interval(0.0, 0.75), ('a', 'b', 'c')),
        FragmentToken('b', Interval(0.25, 1.0), ('b', 'c', 'd')),
        FragmentToken('b', Interval(0.5, 1.25), ('c', 'd', 'e')),

        FragmentToken('c', Interval(0.0, 1.0), ('f', 'g', 'h', 'i')),
        FragmentToken('c', Interval(0.25, 1.25), ('g', 'h', 'i', 'j')),
        FragmentToken('c', Interval(0.0, 0.75), ('f', 'g', 'h')),
        FragmentToken('c', Interval(0.25, 1.0), ('g', 'h', 'i')),
        FragmentToken('c', Interval(0.5, 1.25), ('h', 'i', 'j')),
    ]

    def _completions(self, left, right):
        """Return the produced pairs for ``left`` and ``right`` as a set.

        The trailing arguments 3 and 20 are passed through unchanged.
        NOTE(review): presumably the minimum and maximum substring length
        -- confirm against ``pairwise_substring_completion``.
        """
        return set(pairwise_substring_completion(left, right, self.ca, 3, 20))

    def test_same(self):
        """fragment1 against fragment2: abcd - abcd."""
        pf = self.pfragments
        expected = {
            (pf[0], pf[5]),  # abcd - abcd
            (pf[2], pf[7]),  # abc - abc
            (pf[3], pf[8]),  # bcd - bcd
        }
        produced = self._completions(self.fragment1, self.fragment2)
        assert produced == expected

    def test_different(self):
        """fragment1 against fragment3: abcd - fghi."""
        pf = self.pfragments
        expected = {
            (pf[0], pf[10]),  # abcd - fghi
            (pf[2], pf[12]),  # abc - fgh
            (pf[3], pf[13]),  # bcd - ghi
        }
        produced = self._completions(self.fragment1, self.fragment3)
        assert expected == produced

    def test_longer(self):
        """fragment1 against fragment4: abcd - abcde."""
        pf = self.pfragments
        expected = {
            (pf[0], pf[5]),  # abcd - abcd
            (pf[2], pf[7]),  # abc - abc
            (pf[3], pf[8]),  # bcd - bcd

            (pf[0], pf[6]),  # abcd - bcde
            (pf[2], pf[8]),  # abc - bcd
            (pf[3], pf[9]),  # bcd - cde
        }
        produced = self._completions(self.fragment1, self.fragment4)
        assert expected == produced

    def test_different_and_longer(self):
        """fragment3 against fragment4: fghi - abcde."""
        pf = self.pfragments
        expected = {
            (pf[10], pf[5]),  # fghi - abcd
            (pf[12], pf[7]),  # fgh - abc
            (pf[13], pf[8]),  # ghi - bcd

            (pf[10], pf[6]),  # fghi - bcde
            (pf[12], pf[8]),  # fgh - bcd
            (pf[13], pf[9]),  # ghi - cde
        }
        produced = self._completions(self.fragment3, self.fragment4)
        assert expected == produced
Beispiel #9
0
class TestReadClasses(object):
    """Exercise ``read_classfile`` and ``annotate_classes`` on tiny inputs.

    Fixtures:

    * ``tiny_classes`` -- class file text with three classes, each listing
      one fragment for 'f1' and one for 'f2'.
    * ``tiny_corpus`` -- annotation text with four unit-length tokens
      marked a..d for each of 'f1' and 'f2'.
    * ``clsdict_e`` -- expected result of reading ``tiny_classes``; the
      fragments carry no mark.
    * ``clsdict_a`` -- expected result after annotation; the fragments
      carry tuples of marks.
    * ``tokens`` / ``corpus`` -- expected result of reading ``tiny_corpus``.
    """

    tiny_classes = """Class 0
f1 0.000 4.000
f2 0.000 4.000

Class 1
f1 1.000 4.000
f2 1.000 4.000

Class 2
f1 0.000 3.000
f2 0.000 3.000


"""
    tiny_corpus = """f1 0.000 1.000 a
f1 1.000 2.000 b
f1 2.000 3.000 c
f1 3.000 4.000 d
f2 0.000 1.000 a
f2 1.000 2.000 b
f2 2.000 3.000 c
f2 3.000 4.000 d

"""
    clsdict_e = {
        ClassID(0, None): (
            FragmentToken('f1', Interval(0.0, 4.0), None),
            FragmentToken('f2', Interval(0.0, 4.0), None),
        ),
        ClassID(1, None): (
            FragmentToken('f1', Interval(1.0, 4.0), None),
            FragmentToken('f2', Interval(1.0, 4.0), None),
        ),
        ClassID(2, None): (
            FragmentToken('f1', Interval(0.0, 3.0), None),
            FragmentToken('f2', Interval(0.0, 3.0), None),
        ),
    }
    clsdict_a = {
        ClassID(0, None): (
            FragmentToken('f1', Interval(0.0, 4.0), ('a', 'b', 'c', 'd')),
            FragmentToken('f2', Interval(0.0, 4.0), ('a', 'b', 'c', 'd')),
        ),
        ClassID(1, None): (
            FragmentToken('f1', Interval(1.0, 4.0), ('b', 'c', 'd')),
            FragmentToken('f2', Interval(1.0, 4.0), ('b', 'c', 'd')),
        ),
        ClassID(2, None): (
            FragmentToken('f1', Interval(0.0, 3.0), ('a', 'b', 'c')),
            FragmentToken('f2', Interval(0.0, 3.0), ('a', 'b', 'c')),
        ),
    }
    tokens = [
        FragmentToken('f1', Interval(0.0, 1.0), 'a'),
        FragmentToken('f1', Interval(1.0, 2.0), 'b'),
        FragmentToken('f1', Interval(2.0, 3.0), 'c'),
        FragmentToken('f1', Interval(3.0, 4.0), 'd'),
        FragmentToken('f2', Interval(0.0, 1.0), 'a'),
        FragmentToken('f2', Interval(1.0, 2.0), 'b'),
        FragmentToken('f2', Interval(2.0, 3.0), 'c'),
        FragmentToken('f2', Interval(3.0, 4.0), 'd'),
    ]
    corpus = Corpus([
        SegmentAnnotation('f1', tokens[:4]),
        SegmentAnnotation('f2', tokens[4:]),
    ])

    def test_small(self):
        """Reading the class text must give the unmarked class dictionary."""
        assert self.clsdict_e == read_classfile(self.tiny_classes)

    def test_corpus(self):
        """Reading the annotation text must give the hand-built corpus."""
        tokenlists = read_annotation(self.tiny_corpus)
        assert self.corpus == tokenlists_to_corpus(tokenlists)

    def test_annotate(self):
        """Annotating the read classes must fill in the tuple marks."""
        classes = read_classfile(self.tiny_classes)
        corpus = tokenlists_to_corpus(read_annotation(self.tiny_corpus))
        assert self.clsdict_a == annotate_classes(classes, corpus)
Beispiel #10
0
 def test_different_names(self):
     """Tokens named 'a' and 'b' in one annotation must raise ValueError."""
     with pytest.raises(ValueError):
         mixed_names = [
             FragmentToken('a', Interval(0, 1), None),
             FragmentToken('b', Interval(1, 2), None),
         ]
         SegmentAnnotation('', mixed_names)
Beispiel #11
0
 def test_non_contiguous(self):
     """Tokens with a gap between 1 and 2 must raise ValueError."""
     with pytest.raises(ValueError):
         gapped = [
             FragmentToken('a', Interval(0, 1), None),
             FragmentToken('a', Interval(2, 3), None),
         ]
         SegmentAnnotation('', gapped)
Beispiel #12
0
 def test_empty(self):
     """An annotation built from no tokens keeps its name; interval is None."""
     empty = SegmentAnnotation('', [])
     assert empty.name == ''
     assert empty.interval is None
Beispiel #13
0
class TestSegmentAnnotation(object):
    """Exercise the SegmentAnnotation container.

    Fixtures:

    * ``tokenlist`` -- five contiguous tokens named 'a' covering 0.0 to
      0.5, marked 'a', 'r', 'm', 's', 'a'.
    * ``sa`` -- annotation called 'name1' built from ``tokenlist``.
    """

    tokenlist = (
        FragmentToken('a', Interval(0.0, 0.1), 'a'),
        FragmentToken('a', Interval(0.1, 0.2), 'r'),
        FragmentToken('a', Interval(0.2, 0.3), 'm'),
        FragmentToken('a', Interval(0.3, 0.4), 's'),
        FragmentToken('a', Interval(0.4, 0.5), 'a'),
    )
    sa = SegmentAnnotation('name1', tokenlist)

    def test_restrict(self):
        """Restricting to (0, 0.5) keeps all tokens, (0, 0.3) the first three."""
        db1 = IntervalDB({'a': [Interval(0, 0.5)]})
        db2 = IntervalDB({'a': [Interval(0, 0.3)]})
        assert self.sa.restrict(db1) == self.sa
        assert self.sa.restrict(db2) == SegmentAnnotation(
            'name1', self.tokenlist[:3])

    def test_len(self):
        """The annotation length is the number of tokens."""
        assert len(self.sa) == 5

    def test_iter(self):
        """Iterating yields the tokens in their original order."""
        assert list(iter(self.sa)) == list(self.tokenlist)

    def test_get_item(self):
        """Integer indexing returns the token at that position."""
        # enumerate replaces the Python-2-only xrange(len(...)) loop, so
        # the test also runs on Python 3.
        for i, token in enumerate(self.tokenlist):
            assert self.sa[i] == token

    def test_eq(self):
        """An annotation compares equal to itself."""
        assert self.sa == self.sa

    def test_eq_wrong_name(self):
        """Empty annotations with different names compare unequal."""
        sa1 = SegmentAnnotation('name1', [])
        sa2 = SegmentAnnotation('name2', [])
        assert sa1 != sa2

    def test_eq_wrong_interval(self):
        """Same name but different token interval: unequal."""
        sa1 = SegmentAnnotation('name1',
                                [FragmentToken('', Interval(0, 1), None)])
        sa2 = SegmentAnnotation('name1',
                                [FragmentToken('', Interval(0, 3), None)])
        assert sa1 != sa2

    def test_eq_wrong_ntokens(self):
        """Same name and overall span but different token counts: unequal."""
        sa1 = SegmentAnnotation('name1',
                                [FragmentToken('', Interval(0, 2), None)])
        sa2 = SegmentAnnotation('name1', [
            FragmentToken('', Interval(0, 1), None),
            FragmentToken('', Interval(1, 2), None)
        ])
        assert sa1 != sa2

    def test_tokens_at_interval(self):
        """Check which tokens are returned for several query intervals."""
        # the full span: every token
        assert self.sa.tokens_at_interval(Interval(0.0, 0.5)) == tuple(
            self.tokenlist)
        # an inner span: tokens 1 to 3
        assert self.sa.tokens_at_interval(Interval(0.1, 0.4)) == tuple(
            self.tokenlist[1:4])
        # a span inside the first token: that token alone
        assert self.sa.tokens_at_interval(Interval(
            0.0, 0.05)) == (self.tokenlist[0], )
        # a span far from the annotation: nothing
        assert self.sa.tokens_at_interval(Interval(10, 11)) == tuple()
        # an annotation without tokens has nothing to return
        assert SegmentAnnotation('', []).tokens_at_interval(Interval(
            0, 1)) == tuple()

    def test_annotation_at_interval(self):
        """Check which marks are returned for several query intervals."""
        assert self.sa.annotation_at_interval(Interval(0.0, 0.5)) == tuple(
            ['a', 'r', 'm', 's', 'a'])
        assert self.sa.annotation_at_interval(Interval(0.1, 0.4)) == tuple(
            ['r', 'm', 's'])
        assert self.sa.annotation_at_interval(
            Interval(0.0, 0.05)) == tuple(['a'])
        assert self.sa.annotation_at_interval(Interval(10, 11)) == tuple()

    def test_empty(self):
        """An annotation built from no tokens keeps its name; interval is None."""
        e = SegmentAnnotation('', [])
        assert e.name == ''
        assert e.interval is None

    def test_non_contiguous(self):
        """Tokens with a gap between 1 and 2 must raise ValueError."""
        with pytest.raises(ValueError):
            SegmentAnnotation('', [
                FragmentToken('a', Interval(0, 1), None),
                FragmentToken('a', Interval(2, 3), None)
            ])

    def test_different_names(self):
        """Tokens named 'a' and 'b' in one annotation must raise ValueError."""
        with pytest.raises(ValueError):
            SegmentAnnotation('', [
                FragmentToken('a', Interval(0, 1), None),
                FragmentToken('b', Interval(1, 2), None)
            ])
Beispiel #14
0
 def test_eq_wrong_name(self):
     """Empty annotations with different names compare unequal."""
     names = ('name1', 'name2')
     first, second = [SegmentAnnotation(name, []) for name in names]
     assert first != second
Beispiel #15
0
 def test_restrict(self):
     """Restrict the annotation to two interval databases for name 'a'."""
     whole_span = IntervalDB({'a': [Interval(0, 0.5)]})
     leading_span = IntervalDB({'a': [Interval(0, 0.3)]})

     # (0, 0.5): the annotation is expected back unchanged
     assert self.sa.restrict(whole_span) == self.sa

     # (0, 0.3): only the first three tokens are expected to remain
     restricted = self.sa.restrict(leading_span)
     assert restricted == SegmentAnnotation('name1', self.tokenlist[:3])
Beispiel #16
0
class TestCorpus(object):
    """Exercise the Corpus container.

    Fixtures:

    * ``segment_annotations`` -- two annotations for 'a' (0.0 to 0.5 and
      0.7 to 1.3) and one for 'b' (0.1 to 0.6).
    * ``ca`` -- corpus built from ``segment_annotations``.
    """

    segment_annotations = [
        SegmentAnnotation('a', [
            FragmentToken('a', Interval(0.0, 0.1), 'a'),
            FragmentToken('a', Interval(0.1, 0.2), 'r'),
            FragmentToken('a', Interval(0.2, 0.3), 'm'),
            FragmentToken('a', Interval(0.3, 0.4), 's'),
            FragmentToken('a', Interval(0.4, 0.5), 'a')
        ]),
        SegmentAnnotation('a', [
            FragmentToken('a', Interval(0.7, 0.8), 'w'),
            FragmentToken('a', Interval(0.8, 0.9), 'o'),
            FragmentToken('a', Interval(0.9, 1.0), 'r'),
            FragmentToken('a', Interval(1.0, 1.1), 'm'),
            FragmentToken('a', Interval(1.1, 1.2), 's'),
            FragmentToken('a', Interval(1.2, 1.3), 'a')
        ]),
        SegmentAnnotation('b', [
            FragmentToken('b', Interval(0.1, 0.2), 'w'),
            FragmentToken('b', Interval(0.2, 0.3), 'o'),
            FragmentToken('b', Interval(0.3, 0.4), 'r'),
            FragmentToken('b', Interval(0.4, 0.5), 'd'),
            FragmentToken('b', Interval(0.5, 0.6), 's')
        ])
    ]

    ca = Corpus(segment_annotations)

    def test_len(self):
        """The corpus length counts distinct names, not annotations."""
        assert len(self.ca) == 2  # only 2 distinct fname keys

    def test_getitem(self):
        """Indexing by name returns the annotations stored for that name."""
        # Check every key; the earlier loop never used its index and so
        # asserted the lookup of 'b' once per key.
        for fname in self.ca.keys():
            assert self.ca[fname] == self.ca.segment_annotations[fname]

    def test_ca_intervals(self):
        """Each name maps to the intervals of its annotations, in order."""
        exp_intervals = {
            'a': [Interval(0.0, 0.5), Interval(0.7, 1.3)],
            'b': [Interval(0.1, 0.6)]
        }
        pred_intervals = {
            fname: [fa.interval for fa in self.ca.segment_annotations[fname]]
            for fname in self.ca.keys()
        }
        assert exp_intervals == pred_intervals

    def test_annotation_simple(self):
        """Marks for 'b' between 0.2 and 0.5 are 'o', 'r', 'd'."""
        assert self.ca.annotation('b', Interval(0.2, 0.5)) == ('o', 'r', 'd')

    def test_annotation_complex(self):
        """Marks for 'a' between 0.7 and 1.2 come from its second annotation."""
        found = self.ca.annotation('a', Interval(0.7, 1.2))
        assert found == ('w', 'o', 'r', 'm', 's')

    def test_badtoken_fname(self):
        """Asking for tokens of an unknown name must raise KeyError."""
        with pytest.raises(KeyError):
            self.ca.tokens('badfilename', 1)

    def test_badtoken_interval(self):
        """Intervals entirely outside the annotations must raise ValueError."""
        with pytest.raises(ValueError):
            self.ca.tokens('a', Interval(-10, -5))
        with pytest.raises(ValueError):
            self.ca.tokens('a', Interval(10, 20))

    def test_restrict(self):
        """Restrict the corpus to three interval databases for name 'a'."""
        s1 = IntervalDB({'a': [(0.0, 0.5)]})
        s2 = IntervalDB({'a': [(0.0, 1.3)]})
        s3 = IntervalDB({'a': [(0.0, 1.0)]})
        # (0.0, 0.5): only the first annotation is expected to remain
        assert self.ca.restrict(s1) == Corpus(self.segment_annotations[:1])
        # (0.0, 1.3): the first two annotations are expected to remain
        assert self.ca.restrict(s2) == Corpus(self.segment_annotations[:2])
        # (0.0, 1.0): the second annotation is expected to keep only its
        # first three tokens
        assert self.ca.restrict(s3) == Corpus([
            self.segment_annotations[0],
            SegmentAnnotation('a', self.segment_annotations[1][:3])
        ])