class TestSegmentAnnotationCmp(object):
    """Exercise ``annotation_cmp`` on single-token segment annotations."""

    # sa1, sa2 and sa3 are all named 'n1'; sa4 is named 'n2'.
    # sa3's interval (1.3, 1.4) lies inside sa2's interval (0.5, 1.5).
    sa1 = SegmentAnnotation(
        'n1', [FragmentToken('n1', Interval(0, 0.5), None)])
    sa2 = SegmentAnnotation(
        'n1', [FragmentToken('n1', Interval(0.5, 1.5), None)])
    sa3 = SegmentAnnotation(
        'n1', [FragmentToken('n1', Interval(1.3, 1.4), None)])
    sa4 = SegmentAnnotation(
        'n2', [FragmentToken('n2', Interval(0, 1), None)])

    def test_invalid_comparison(self):
        """Comparing sa1 ('n1') with sa4 ('n2') is expected to raise."""
        with pytest.raises(ValueError):
            annotation_cmp(self.sa1, self.sa4)

    def test_annotation_eq(self):
        """An annotation compared with itself is expected to yield 0."""
        for annotation in (self.sa1, self.sa2, self.sa3):
            assert annotation_cmp(annotation, annotation) == 0

    def test_annotation_cmp(self):
        """Check the expected -1 / 1 / 0 result for each ordered pair."""
        cases = [
            (self.sa1, self.sa2, -1),
            (self.sa1, self.sa3, -1),
            (self.sa2, self.sa1, 1),
            (self.sa3, self.sa1, 1),
            (self.sa2, self.sa3, 0),
            (self.sa3, self.sa2, 0),
        ]
        for left, right, expected in cases:
            assert annotation_cmp(left, right) == expected
def test_eq_wrong_ntokens(self):
    """Same name and overall span, but a different token count: unequal."""
    one_token = SegmentAnnotation(
        'name1', [FragmentToken('', Interval(0, 2), None)])
    two_tokens = SegmentAnnotation(
        'name1',
        [
            FragmentToken('', Interval(0, 1), None),
            FragmentToken('', Interval(1, 2), None),
        ])
    assert one_token != two_tokens
class TestCheckTruncateIntervals(object):
    """Tests for ``truncate_intervals`` against a one-segment corpus."""

    # Interval database holding two intervals for the name 'a'.
    m1 = IntervalDB({'a': [(0.0, 1.0), (2.0, 3.0)]})

    # One-fragment class dictionaries, all keyed by ClassID(0, 'm1').
    d1 = ClassDict({
        ClassID(0, 'm1'): (FragmentToken('a', Interval(0.0, 1.0), 'm1'), ),
    })
    d2 = ClassDict({
        ClassID(0, 'm1'): (FragmentToken('a', Interval(0.5, 1.5), 'm1'), ),
    })
    d3 = ClassDict({
        ClassID(0, 'm1'): (FragmentToken('b', Interval(0.0, 1.0), 'm1'), ),
    })
    d4 = ClassDict({
        ClassID(0, 'm1'): (FragmentToken('a', Interval(0.5, 2.5), 'm1'), ),
    })

    # Corpus with a single segment for 'a' covering 0.0 to 1.0 in four tokens.
    sa = [
        SegmentAnnotation(
            'a',
            [
                FragmentToken('a', Interval(0.0, 0.25), 'a'),
                FragmentToken('a', Interval(0.25, 0.5), 'b'),
                FragmentToken('a', Interval(0.5, 0.75), 'c'),
                FragmentToken('a', Interval(0.75, 1.0), 'd'),
            ]),
    ]
    ca = Corpus(sa)

    def test_good_interval(self):
        """d1 is expected back unchanged, with two empty lists."""
        outcome = truncate_intervals(self.d1, self.ca, self.m1)
        assert outcome == (self.d1, [], [])

    def test_truncate_interval(self):
        """d2 and d4 are both expected to come back cut to (0.5, 1.0)."""
        for clsdict in (self.d2, self.d4):
            outcome = truncate_intervals(clsdict, self.ca, self.m1)
            expected = ClassDict({
                ClassID(0, 'm1'): (
                    FragmentToken('a', Interval(0.5, 1.0), ('c', 'd')),
                ),
            })
            assert outcome == (expected, [], [])
def test_read_small(self):
    """Parse a small annotation text and compare it with hand-built tokens."""
    # NOTE(review): the records in this literal are separated by single
    # spaces as written here -- confirm that this is the layout
    # read_annotation expects.
    contents = """f1 0.000 0.100 a f1 0.100 0.200 r f1 0.200 0.300 m f1 0.300 0.400 s f1 0.400 0.500 a f1 0.700 0.800 w f1 0.800 0.900 o f1 0.900 1.000 r f1 1.000 1.100 m f1 1.100 1.200 s f1 1.200 1.300 a f2 0.100 0.200 w f2 0.200 0.300 o f2 0.300 0.400 r f2 0.400 0.500 d f2 0.500 0.600 s """

    # (name, start, end, mark) for every expected token, in file order.
    rows = [
        ('f1', 0.0, 0.1, 'a'),
        ('f1', 0.1, 0.2, 'r'),
        ('f1', 0.2, 0.3, 'm'),
        ('f1', 0.3, 0.4, 's'),
        ('f1', 0.4, 0.5, 'a'),
        ('f1', 0.7, 0.8, 'w'),
        ('f1', 0.8, 0.9, 'o'),
        ('f1', 0.9, 1.0, 'r'),
        ('f1', 1.0, 1.1, 'm'),
        ('f1', 1.1, 1.2, 's'),
        ('f1', 1.2, 1.3, 'a'),
        ('f2', 0.1, 0.2, 'w'),
        ('f2', 0.2, 0.3, 'o'),
        ('f2', 0.3, 0.4, 'r'),
        ('f2', 0.4, 0.5, 'd'),
        ('f2', 0.5, 0.6, 's'),
    ]
    tokens = [
        FragmentToken(name, Interval(start, end), mark)
        for name, start, end, mark in rows
    ]

    # Expected grouping: two runs for 'f1' (there is a gap between 0.5 and
    # 0.7) followed by one run for 'f2'.
    names = ['f1', 'f1', 'f2']
    bounds = [slice(0, 5), slice(5, 11), slice(11, None)]
    corpus = Corpus([
        SegmentAnnotation(name, tokens[span])
        for name, span in zip(names, bounds)
    ])

    assert [tokens[span] for span in bounds] == read_annotation(contents)
    assert tokenlists_to_corpus(read_annotation(contents)) == corpus
def test_tokens_at_interval(self):
    """Check which tokens ``tokens_at_interval`` returns for several queries."""
    toks = self.tokenlist
    lookup = self.sa.tokens_at_interval

    # Full span, an inner span, and a span within the first token.
    assert lookup(Interval(0.0, 0.5)) == tuple(toks)
    assert lookup(Interval(0.1, 0.4)) == tuple(toks[1:4])
    assert lookup(Interval(0.0, 0.05)) == (toks[0], )

    # A query beyond the annotation, and a query on an empty annotation.
    assert lookup(Interval(10, 11)) == ()
    assert SegmentAnnotation('', []).tokens_at_interval(Interval(0, 1)) == ()
def test_restrict(self):
    """Restrict the corpus to three interval databases over the name 'a'."""
    segments = self.segment_annotations

    upto_half = IntervalDB({'a': [(0.0, 0.5)]})
    upto_end = IntervalDB({'a': [(0.0, 1.3)]})
    upto_one = IntervalDB({'a': [(0.0, 1.0)]})

    assert self.ca.restrict(upto_half) == Corpus(segments[:1])
    assert self.ca.restrict(upto_end) == Corpus(segments[:2])

    # Ending at 1.0 is expected to keep only the first three tokens of the
    # second segment.
    partial = Corpus([
        segments[0],
        SegmentAnnotation('a', segments[1][:3]),
    ])
    assert self.ca.restrict(upto_one) == partial
def tokenlists_to_corpus(tokenlists):
    """Build a Corpus from lists of tokens.

    Each inner list becomes one SegmentAnnotation whose name is taken from
    the ``name`` attribute of the list's first token.

    Parameters
    ----------
    tokenlists : list of list of FragmentToken

    Returns
    -------
    c : Corpus

    """
    return Corpus([
        SegmentAnnotation(tokens[0].name, tokens)
        for tokens in tokenlists
    ])
class TestPairwiseSubstringCompletion(object):
    """Tests for ``pairwise_substring_completion`` over a three-name corpus."""

    # Five consecutive 0.25-wide tokens for each of the names 'a', 'b', 'c'.
    fragments = [
        FragmentToken('a', Interval(0.0, 0.25), 'a'),
        FragmentToken('a', Interval(0.25, 0.5), 'b'),
        FragmentToken('a', Interval(0.5, 0.75), 'c'),
        FragmentToken('a', Interval(0.75, 1.0), 'd'),
        FragmentToken('a', Interval(1.0, 1.25), 'e'),
        FragmentToken('b', Interval(0.0, 0.25), 'a'),
        FragmentToken('b', Interval(0.25, 0.5), 'b'),
        FragmentToken('b', Interval(0.5, 0.75), 'c'),
        FragmentToken('b', Interval(0.75, 1.0), 'd'),
        FragmentToken('b', Interval(1.0, 1.25), 'e'),
        FragmentToken('c', Interval(0.0, 0.25), 'f'),
        FragmentToken('c', Interval(0.25, 0.5), 'g'),
        FragmentToken('c', Interval(0.5, 0.75), 'h'),
        FragmentToken('c', Interval(0.75, 1.0), 'i'),
        FragmentToken('c', Interval(1.0, 1.25), 'j'),
    ]
    sa = [
        SegmentAnnotation('a', fragments[:5]),
        SegmentAnnotation('b', fragments[5:10]),
        SegmentAnnotation('c', fragments[10:]),
    ]
    ca = Corpus(sa)

    # Unannotated query fragments handed to the function under test.
    fragment1 = FragmentToken('a', Interval(0.0, 1.0), None)
    fragment2 = FragmentToken('b', Interval(0.0, 1.0), None)
    fragment3 = FragmentToken('c', Interval(0.0, 1.0), None)
    fragment4 = FragmentToken('b', Interval(0.0, 1.25), None)

    # Annotated fragments used to spell out the expected pairs.  Per name the
    # order is: first four tokens, last four tokens, then the three
    # three-token windows from left to right.
    pfragments = [
        FragmentToken('a', Interval(0.0, 1.0), ('a', 'b', 'c', 'd')),
        FragmentToken('a', Interval(0.25, 1.25), ('b', 'c', 'd', 'e')),
        FragmentToken('a', Interval(0.0, 0.75), ('a', 'b', 'c')),
        FragmentToken('a', Interval(0.25, 1.0), ('b', 'c', 'd')),
        FragmentToken('a', Interval(0.5, 1.25), ('c', 'd', 'e')),
        FragmentToken('b', Interval(0.0, 1.0), ('a', 'b', 'c', 'd')),
        FragmentToken('b', Interval(0.25, 1.25), ('b', 'c', 'd', 'e')),
        FragmentToken('b', Interval(0.0, 0.75), ('a', 'b', 'c')),
        FragmentToken('b', Interval(0.25, 1.0), ('b', 'c', 'd')),
        FragmentToken('b', Interval(0.5, 1.25), ('c', 'd', 'e')),
        FragmentToken('c', Interval(0.0, 1.0), ('f', 'g', 'h', 'i')),
        FragmentToken('c', Interval(0.25, 1.25), ('g', 'h', 'i', 'j')),
        FragmentToken('c', Interval(0.0, 0.75), ('f', 'g', 'h')),
        FragmentToken('c', Interval(0.25, 1.0), ('g', 'h', 'i')),
        FragmentToken('c', Interval(0.5, 1.25), ('h', 'i', 'j')),
    ]

    def test_same(self):
        """fragment1 - fragment2 (abcd - abcd).

        Expected pairs::

            abcd - abcd
            abc  - abc
            bcd  - bcd
        """
        pf = self.pfragments
        expected = {
            (pf[0], pf[5]),
            (pf[2], pf[7]),
            (pf[3], pf[8]),
        }
        found = set(pairwise_substring_completion(
            self.fragment1, self.fragment2, self.ca, 3, 20))
        assert found == expected

    def test_different(self):
        """fragment1 - fragment3 (abcd - fghi).

        Expected pairs::

            abcd - fghi
            abc  - fgh
            bcd  - ghi
        """
        pf = self.pfragments
        expected = {
            (pf[0], pf[10]),
            (pf[2], pf[12]),
            (pf[3], pf[13]),
        }
        found = set(pairwise_substring_completion(
            self.fragment1, self.fragment3, self.ca, 3, 20))
        assert expected == found

    def test_longer(self):
        """fragment1 - fragment4 (abcd - abcde).

        Expected pairs::

            abcd - abcd
            abc  - abc
            bcd  - bcd
            abcd - bcde
            abc  - bcd
            bcd  - cde
        """
        pf = self.pfragments
        expected = {
            (pf[0], pf[5]),
            (pf[2], pf[7]),
            (pf[3], pf[8]),
            (pf[0], pf[6]),
            (pf[2], pf[8]),
            (pf[3], pf[9]),
        }
        found = set(pairwise_substring_completion(
            self.fragment1, self.fragment4, self.ca, 3, 20))
        assert expected == found

    def test_different_and_longer(self):
        """fragment3 - fragment4 (fghi - abcde).

        Expected pairs::

            fghi - abcd
            fgh  - abc
            ghi  - bcd
            fghi - bcde
            fgh  - bcd
            ghi  - cde
        """
        pf = self.pfragments
        expected = {
            (pf[10], pf[5]),
            (pf[12], pf[7]),
            (pf[13], pf[8]),
            (pf[10], pf[6]),
            (pf[12], pf[8]),
            (pf[13], pf[9]),
        }
        found = set(pairwise_substring_completion(
            self.fragment3, self.fragment4, self.ca, 3, 20))
        assert expected == found
class TestReadClasses(object):
    """Tests for ``read_classfile``, ``read_annotation`` and ``annotate_classes``."""

    # NOTE(review): both text fixtures are reproduced exactly as written
    # here, with single spaces between records -- confirm that this is the
    # layout the readers expect.
    tiny_classes = """Class 0 f1 0.000 4.000 f2 0.000 4.000 Class 1 f1 1.000 4.000 f2 1.000 4.000 Class 2 f1 0.000 3.000 f2 0.000 3.000 """
    tiny_corpus = """f1 0.000 1.000 a f1 1.000 2.000 b f1 2.000 3.000 c f1 3.000 4.000 d f2 0.000 1.000 a f2 1.000 2.000 b f2 2.000 3.000 c f2 3.000 4.000 d """

    # Expected classes straight from the class text: fragments carry no mark.
    clsdict_e = {
        ClassID(0, None): (
            FragmentToken('f1', Interval(0.0, 4.0), None),
            FragmentToken('f2', Interval(0.0, 4.0), None),
        ),
        ClassID(1, None): (
            FragmentToken('f1', Interval(1.0, 4.0), None),
            FragmentToken('f2', Interval(1.0, 4.0), None),
        ),
        ClassID(2, None): (
            FragmentToken('f1', Interval(0.0, 3.0), None),
            FragmentToken('f2', Interval(0.0, 3.0), None),
        ),
    }

    # Expected classes after annotation: marks are tuples of corpus symbols.
    clsdict_a = {
        ClassID(0, None): (
            FragmentToken('f1', Interval(0.0, 4.0), ('a', 'b', 'c', 'd')),
            FragmentToken('f2', Interval(0.0, 4.0), ('a', 'b', 'c', 'd')),
        ),
        ClassID(1, None): (
            FragmentToken('f1', Interval(1.0, 4.0), ('b', 'c', 'd')),
            FragmentToken('f2', Interval(1.0, 4.0), ('b', 'c', 'd')),
        ),
        ClassID(2, None): (
            FragmentToken('f1', Interval(0.0, 3.0), ('a', 'b', 'c')),
            FragmentToken('f2', Interval(0.0, 3.0), ('a', 'b', 'c')),
        ),
    }

    tokens = [
        FragmentToken('f1', Interval(0.0, 1.0), 'a'),
        FragmentToken('f1', Interval(1.0, 2.0), 'b'),
        FragmentToken('f1', Interval(2.0, 3.0), 'c'),
        FragmentToken('f1', Interval(3.0, 4.0), 'd'),
        FragmentToken('f2', Interval(0.0, 1.0), 'a'),
        FragmentToken('f2', Interval(1.0, 2.0), 'b'),
        FragmentToken('f2', Interval(2.0, 3.0), 'c'),
        FragmentToken('f2', Interval(3.0, 4.0), 'd'),
    ]
    corpus = Corpus([
        SegmentAnnotation('f1', tokens[:4]),
        SegmentAnnotation('f2', tokens[4:]),
    ])

    def test_small(self):
        """The class text is expected to parse into ``clsdict_e``."""
        assert self.clsdict_e == read_classfile(self.tiny_classes)

    def test_corpus(self):
        """The corpus text is expected to convert into ``corpus``."""
        parsed = tokenlists_to_corpus(read_annotation(self.tiny_corpus))
        assert self.corpus == parsed

    def test_annotate(self):
        """Annotating the parsed classes is expected to give ``clsdict_a``."""
        annotated = annotate_classes(
            read_classfile(self.tiny_classes),
            tokenlists_to_corpus(read_annotation(self.tiny_corpus)))
        assert self.clsdict_a == annotated
def test_different_names(self):
    """Tokens named 'a' and 'b' in one annotation are expected to raise."""
    mixed_names = [
        FragmentToken('a', Interval(0, 1), None),
        FragmentToken('b', Interval(1, 2), None),
    ]
    with pytest.raises(ValueError):
        SegmentAnnotation('', mixed_names)
def test_non_contiguous(self):
    """Tokens with a gap between 1 and 2 are expected to raise."""
    gapped = [
        FragmentToken('a', Interval(0, 1), None),
        FragmentToken('a', Interval(2, 3), None),
    ]
    with pytest.raises(ValueError):
        SegmentAnnotation('', gapped)
def test_empty(self):
    """An annotation built from no tokens keeps its name and has no interval."""
    empty = SegmentAnnotation('', [])
    assert empty.name == ''
    assert empty.interval is None
class TestSegmentAnnotation(object):
    """Tests for the SegmentAnnotation container."""

    # Five consecutive 0.1-wide tokens named 'a', marked a, r, m, s, a.
    tokenlist = (
        FragmentToken('a', Interval(0.0, 0.1), 'a'),
        FragmentToken('a', Interval(0.1, 0.2), 'r'),
        FragmentToken('a', Interval(0.2, 0.3), 'm'),
        FragmentToken('a', Interval(0.3, 0.4), 's'),
        FragmentToken('a', Interval(0.4, 0.5), 'a'),
    )
    sa = SegmentAnnotation('name1', tokenlist)

    def test_restrict(self):
        """Restrict to the full span and to a prefix ending at 0.3."""
        db1 = IntervalDB({'a': [Interval(0, 0.5)]})
        db2 = IntervalDB({'a': [Interval(0, 0.3)]})
        assert self.sa.restrict(db1) == self.sa
        assert self.sa.restrict(db2) == SegmentAnnotation(
            'name1', self.tokenlist[:3])

    def test_len(self):
        """The annotation reports one item per token."""
        assert len(self.sa) == 5

    def test_iter(self):
        """Iteration yields the tokens in their original order."""
        assert list(iter(self.sa)) == list(self.tokenlist)

    def test_get_item(self):
        """Integer indexing returns the token at that position."""
        # enumerate replaces the Python-2-only xrange(len(...)) loop, so the
        # test runs on both Python 2 and Python 3.
        for i, token in enumerate(self.tokenlist):
            assert self.sa[i] == token

    def test_eq(self):
        """An annotation equals itself."""
        assert self.sa == self.sa

    def test_eq_wrong_name(self):
        """Empty annotations with different names are unequal."""
        sa1 = SegmentAnnotation('name1', [])
        sa2 = SegmentAnnotation('name2', [])
        assert sa1 != sa2

    def test_eq_wrong_interval(self):
        """Same name, different token interval: unequal."""
        sa1 = SegmentAnnotation(
            'name1', [FragmentToken('', Interval(0, 1), None)])
        sa2 = SegmentAnnotation(
            'name1', [FragmentToken('', Interval(0, 3), None)])
        assert sa1 != sa2

    def test_eq_wrong_ntokens(self):
        """Same name and overall span, different token count: unequal."""
        sa1 = SegmentAnnotation(
            'name1', [FragmentToken('', Interval(0, 2), None)])
        sa2 = SegmentAnnotation(
            'name1',
            [
                FragmentToken('', Interval(0, 1), None),
                FragmentToken('', Interval(1, 2), None),
            ])
        assert sa1 != sa2

    def test_tokens_at_interval(self):
        """Check which tokens are returned for several query intervals."""
        assert self.sa.tokens_at_interval(Interval(0.0, 0.5)) == tuple(
            self.tokenlist)
        assert self.sa.tokens_at_interval(Interval(0.1, 0.4)) == tuple(
            self.tokenlist[1:4])
        assert self.sa.tokens_at_interval(Interval(0.0, 0.05)) == (
            self.tokenlist[0], )
        # A query beyond the annotation, and a query on an empty annotation.
        assert self.sa.tokens_at_interval(Interval(10, 11)) == ()
        assert SegmentAnnotation('', []).tokens_at_interval(
            Interval(0, 1)) == ()

    def test_annotation_at_interval(self):
        """Check which marks are returned for several query intervals."""
        assert self.sa.annotation_at_interval(Interval(0.0, 0.5)) == (
            'a', 'r', 'm', 's', 'a')
        assert self.sa.annotation_at_interval(Interval(0.1, 0.4)) == (
            'r', 'm', 's')
        assert self.sa.annotation_at_interval(Interval(0.0, 0.05)) == ('a', )
        assert self.sa.annotation_at_interval(Interval(10, 11)) == ()

    def test_empty(self):
        """An annotation built from no tokens has no interval."""
        e = SegmentAnnotation('', [])
        assert e.name == ''
        assert e.interval is None

    def test_non_contiguous(self):
        """Tokens with a gap between 1 and 2 are expected to raise."""
        with pytest.raises(ValueError):
            SegmentAnnotation('', [
                FragmentToken('a', Interval(0, 1), None),
                FragmentToken('a', Interval(2, 3), None),
            ])

    def test_different_names(self):
        """Tokens named 'a' and 'b' in one annotation are expected to raise."""
        with pytest.raises(ValueError):
            SegmentAnnotation('', [
                FragmentToken('a', Interval(0, 1), None),
                FragmentToken('b', Interval(1, 2), None),
            ])
def test_eq_wrong_name(self):
    """Two empty annotations that differ only in name compare unequal."""
    first = SegmentAnnotation('name1', [])
    second = SegmentAnnotation('name2', [])
    assert first != second
def test_restrict(self):
    """Restrict the annotation to its full span and to a prefix ending at 0.3."""
    whole_span = IntervalDB({'a': [Interval(0, 0.5)]})
    prefix_span = IntervalDB({'a': [Interval(0, 0.3)]})

    assert self.sa.restrict(whole_span) == self.sa

    first_three = SegmentAnnotation('name1', self.tokenlist[:3])
    assert self.sa.restrict(prefix_span) == first_three
class TestCorpus(object):
    """Tests for the Corpus container."""

    # Two segments for the name 'a' (with a gap between 0.5 and 0.7) and one
    # segment for the name 'b'.
    segment_annotations = [
        SegmentAnnotation('a', [
            FragmentToken('a', Interval(0.0, 0.1), 'a'),
            FragmentToken('a', Interval(0.1, 0.2), 'r'),
            FragmentToken('a', Interval(0.2, 0.3), 'm'),
            FragmentToken('a', Interval(0.3, 0.4), 's'),
            FragmentToken('a', Interval(0.4, 0.5), 'a'),
        ]),
        SegmentAnnotation('a', [
            FragmentToken('a', Interval(0.7, 0.8), 'w'),
            FragmentToken('a', Interval(0.8, 0.9), 'o'),
            FragmentToken('a', Interval(0.9, 1.0), 'r'),
            FragmentToken('a', Interval(1.0, 1.1), 'm'),
            FragmentToken('a', Interval(1.1, 1.2), 's'),
            FragmentToken('a', Interval(1.2, 1.3), 'a'),
        ]),
        SegmentAnnotation('b', [
            FragmentToken('b', Interval(0.1, 0.2), 'w'),
            FragmentToken('b', Interval(0.2, 0.3), 'o'),
            FragmentToken('b', Interval(0.3, 0.4), 'r'),
            FragmentToken('b', Interval(0.4, 0.5), 'd'),
            FragmentToken('b', Interval(0.5, 0.6), 's'),
        ]),
    ]
    ca = Corpus(segment_annotations)

    def test_len(self):
        """The corpus length counts distinct names, not segments."""
        assert len(self.ca) == 2  # only 2 distinct fname keys

    def test_getitem(self):
        """Indexing by name returns that name's stored segment annotations."""
        # Check every key; the previous loop never used its index and only
        # ever looked up 'b'.
        for fname in self.ca.keys():
            assert self.ca[fname] == self.ca.segment_annotations[fname]

    def test_ca_intervals(self):
        """Each name maps to the intervals of its segments, in order."""
        exp_intervals = {
            'a': [Interval(0.0, 0.5), Interval(0.7, 1.3)],
            'b': [Interval(0.1, 0.6)],
        }
        pred_intervals = {}
        for fname in self.ca.keys():
            pred_intervals[fname] = [
                fa.interval for fa in self.ca.segment_annotations[fname]
            ]
        assert exp_intervals == pred_intervals

    def test_annotation_simple(self):
        """Marks for an interval inside the single 'b' segment."""
        assert self.ca.annotation('b', Interval(0.2, 0.5)) == ('o', 'r', 'd')

    def test_annotation_complex(self):
        """Marks for an interval inside the second 'a' segment."""
        assert self.ca.annotation('a', Interval(0.7, 1.2)) == (
            'w', 'o', 'r', 'm', 's')

    def test_badtoken_fname(self):
        """An unknown name is expected to raise KeyError."""
        with pytest.raises(KeyError):
            self.ca.tokens('badfilename', 1)

    def test_badtoken_interval(self):
        """Intervals outside every 'a' segment are expected to raise."""
        with pytest.raises(ValueError):
            self.ca.tokens('a', Interval(-10, -5))
        with pytest.raises(ValueError):
            self.ca.tokens('a', Interval(10, 20))

    def test_restrict(self):
        """Restrict the corpus to three interval databases over 'a'."""
        s1 = IntervalDB({'a': [(0.0, 0.5)]})
        s2 = IntervalDB({'a': [(0.0, 1.3)]})
        s3 = IntervalDB({'a': [(0.0, 1.0)]})
        assert self.ca.restrict(s1) == Corpus(self.segment_annotations[:1])
        assert self.ca.restrict(s2) == Corpus(self.segment_annotations[:2])
        # Ending at 1.0 is expected to keep only the first three tokens of
        # the second segment.
        assert self.ca.restrict(s3) == Corpus([
            self.segment_annotations[0],
            SegmentAnnotation('a', self.segment_annotations[1][:3]),
        ])