class TestSegmentAnnotationCmp(object):
    """Tests for annotation_cmp ordering of SegmentAnnotations."""

    sa1 = SegmentAnnotation('n1', [FragmentToken('n1', Interval(0, 0.5), None)])
    sa2 = SegmentAnnotation('n1', [FragmentToken('n1', Interval(0.5, 1.5), None)])
    sa3 = SegmentAnnotation('n1', [FragmentToken('n1', Interval(1.3, 1.4), None)])
    sa4 = SegmentAnnotation('n2', [FragmentToken('n2', Interval(0, 1), None)])

    def test_invalid_comparison(self):
        # annotations on different files cannot be ordered
        with pytest.raises(ValueError):
            annotation_cmp(self.sa1, self.sa4)

    def test_annotation_eq(self):
        # every annotation compares equal to itself
        for sa in (self.sa1, self.sa2, self.sa3):
            assert annotation_cmp(sa, sa) == 0

    def test_annotation_cmp(self):
        # strictly earlier interval sorts first; overlap counts as a tie
        for early, late in ((self.sa1, self.sa2), (self.sa1, self.sa3)):
            assert annotation_cmp(early, late) == -1
            assert annotation_cmp(late, early) == 1
        assert annotation_cmp(self.sa2, self.sa3) == 0
        assert annotation_cmp(self.sa3, self.sa2) == 0
def test_interval_errors(self):
    # fragments overhanging the database intervals come back as
    # interval errors (first list); no filename errors (second list)
    overhang1 = FragmentToken('a', Interval(0.5, 1.5), 'm1')
    assert check_intervals(self.d2, self.m1) == ([overhang1], [])
    overhang2 = FragmentToken('a', Interval(0.5, 2.5), 'm1')
    assert check_intervals(self.d4, self.m1) == ([overhang2], [])
def test_eq_wrong_ntokens(self):
    # same span, different number of tokens -> not equal
    single = SegmentAnnotation(
        'name1', [FragmentToken('', Interval(0, 2), None)])
    double = SegmentAnnotation(
        'name1', [FragmentToken('', Interval(0, 1), None),
                  FragmentToken('', Interval(1, 2), None)])
    assert single != double
def test_truncate_interval(self):
    # d2 and d4 both stick out past 1.0; truncation clips them to
    # (0.5, 1.0) and re-annotates with the covered marks ('c', 'd')
    clipped = ClassDict({
        ClassID(0, 'm1'):
        (FragmentToken('a', Interval(0.5, 1.0), ('c', 'd')),),
    })
    for clsdict in (self.d2, self.d4):
        assert truncate_intervals(clsdict, self.ca, self.m1) == \
            (clipped, [], [])
def test_typeset():
    """typeset yields the distinct marks over all fragment pairs."""
    marks1 = ['m{0}'.format(n) for n in xrange(10)]
    marks2 = ['n{0}'.format(n) for n in xrange(10, 20)]
    pairs = [(FragmentToken(None, Interval(0, 1), m1),
              FragmentToken(None, Interval(0, 1), m2))
             for m1, m2 in zip(marks1, marks2)]
    assert set(typeset(pairs)) == set(marks1 + marks2)

    # no pairs -> empty set
    assert set(typeset([])) == set()

    # identical marks on both sides are not duplicated
    pairs = [(FragmentToken(None, Interval(0, 1), m),
              FragmentToken(None, Interval(0, 1), m))
             for m in marks1]
    assert set(typeset(pairs)) == set(marks1)
class TestCheckTruncateIntervals(object):
    """Tests for truncate_intervals against a small interval database."""

    m1 = IntervalDB({'a': [(0.0, 1.0), (2.0, 3.0)]})
    # d1 lies inside the database; d2 and d4 overhang past 1.0;
    # d3 refers to a filename the database does not know
    d1 = ClassDict(
        {ClassID(0, 'm1'): (FragmentToken('a', Interval(0.0, 1.0), 'm1'),)})
    d2 = ClassDict(
        {ClassID(0, 'm1'): (FragmentToken('a', Interval(0.5, 1.5), 'm1'),)})
    d3 = ClassDict(
        {ClassID(0, 'm1'): (FragmentToken('b', Interval(0.0, 1.0), 'm1'),)})
    d4 = ClassDict(
        {ClassID(0, 'm1'): (FragmentToken('a', Interval(0.5, 2.5), 'm1'),)})
    sa = [SegmentAnnotation('a', [
        FragmentToken('a', Interval(0.0, 0.25), 'a'),
        FragmentToken('a', Interval(0.25, 0.5), 'b'),
        FragmentToken('a', Interval(0.5, 0.75), 'c'),
        FragmentToken('a', Interval(0.75, 1.0), 'd')])]
    ca = Corpus(sa)

    def test_good_interval(self):
        # fully covered fragments pass through untouched
        assert truncate_intervals(self.d1, self.ca, self.m1) == \
            (self.d1, [], [])

    def test_truncate_interval(self):
        # overhanging fragments are clipped to (0.5, 1.0) and
        # re-annotated with the covered marks
        clipped = ClassDict({
            ClassID(0, 'm1'):
            (FragmentToken('a', Interval(0.5, 1.0), ('c', 'd')),),
        })
        for clsdict in (self.d2, self.d4):
            assert truncate_intervals(clsdict, self.ca, self.m1) == \
                (clipped, [], [])
def pairwise_substring_completion(fragment1, fragment2, corpus,
                                  minlength, maxlength):
    """Yield paired substring fragments of two matched fragments.

    For every pair of equal-length substrings (between `minlength` and
    `maxlength` symbols, as produced by psubstrings) of the two
    fragments' token sequences, yield a pair of FragmentTokens spanning
    the corresponding time intervals and carrying the mark sequences.

    Parameters
    ----------
    fragment1, fragment2 : FragmentToken
    corpus : Corpus
    minlength, maxlength : int

    Returns
    -------
    generator of (FragmentToken, FragmentToken)
    """
    name1 = fragment1.name
    name2 = fragment2.name
    seq1 = [(t.mark, t.interval)
            for t in corpus.tokens(name1, fragment1.interval)]
    seq2 = [(t.mark, t.interval)
            for t in corpus.tokens(name2, fragment2.interval)]
    for sub1, sub2 in psubstrings(seq1, seq2, minlength, maxlength):
        marks1, ivals1 = zip(*sub1)
        marks2, ivals2 = zip(*sub2)
        # substring spans from the start of its first interval to the
        # end of its last
        token1 = FragmentToken(
            name1, Interval(ivals1[0].start, ivals1[-1].end), marks1)
        token2 = FragmentToken(
            name2, Interval(ivals2[0].start, ivals2[-1].end), marks2)
        yield token1, token2
def test_freqs():
    """freqs counts occurrences of each mark across all pairs."""
    marks1 = ['m{0}'.format(n) for n in xrange(10)]
    marks2 = ['n{0}'.format(n) for n in xrange(10, 20)]
    pairs = [(FragmentToken(None, Interval(0, 1), m1),
              FragmentToken(None, Interval(0, 1), m2))
             for m1, m2 in zip(marks1, marks2)]
    assert freqs(pairs) == dict.fromkeys(marks1 + marks2, 1)

    # no pairs -> empty mapping
    assert freqs([]) == {}

    # the same mark on both sides of a pair is counted once
    pairs = [(FragmentToken(None, Interval(0, 1), m),
              FragmentToken(None, Interval(0, 1), m))
             for m in marks1]
    assert freqs(pairs) == dict.fromkeys(marks1, 1)
def annotate_classes(clsdict, corpus, split=None):
    """Attach corpus annotations to every token in a class dictionary.

    Parameters
    ----------
    clsdict : dict from ClassID to list of FragmentToken
    corpus : Corpus
        Source of the annotation marks.
    split : IntervalDB, optional
        If given, tokens not covered by the split are recorded as errors
        and clipped to their largest overlap with it.

    Returns
    -------
    (ClassDict, list of FragmentToken)
        The annotated classes (classes that end up empty are dropped)
        and the tokens that fell outside `split`.
    """
    new = {}  # with annotation
    errors = []
    check_split = not (split is None)
    for classID, tokenlist in clsdict.iteritems():
        newtokens = []
        for token in tokenlist:
            filename = token.name
            interval = token.interval
            if check_split and not split.is_covered(filename, interval):
                errors.append(token)
                try:
                    finterval = split.largest_overlap(filename, interval)
                    qstart, qend = interval
                    fstart, fend = finterval
                    # BUG FIX: second comparison used to be
                    # `fstart != qend` (a start compared to an end);
                    # clip the query interval to the covered part
                    if fstart != qstart or fend != qend:
                        newstart = max(fstart, qstart)
                        newend = min(fend, qend)
                        interval = Interval(newstart, newend)
                except KeyError:
                    # filename unknown to the split
                    continue
                except ValueError:
                    # no overlapping interval in the split
                    continue
            try:
                annot = tuple(corpus.annotation(filename, interval))
            except (KeyError, ValueError):
                # was a bare `except:`; corpus lookups raise KeyError
                # for unknown names and ValueError for uncovered
                # intervals -- anything else should propagate
                continue
            newtokens.append(FragmentToken(filename, interval, annot))
        if len(newtokens) > 0:
            new[classID] = tuple(newtokens)
    return ClassDict(new), errors
def truncate_intervals(clsdict, corpus, mapping):
    """Clip every fragment in `clsdict` to the intervals in `mapping`.

    Fragments with an unknown filename are dropped and reported in
    `filename_errors`; fragments with no overlapping interval are
    dropped and reported in `interval_errors`. Clipped fragments are
    re-annotated from `corpus`.

    Parameters
    ----------
    clsdict : ClassDict
    corpus : Corpus
    mapping : IntervalDB

    Returns
    -------
    (ClassDict, list of string, list of FragmentToken)
        Truncated classes, filename errors, interval errors.
    """
    disc = {}
    interval_errors = []
    filename_errors = []
    for class_id in clsdict:
        fragments = []
        for fragment in clsdict[class_id]:
            qname = fragment.name
            qstart = fragment.interval.start
            qend = fragment.interval.end
            try:
                finterval = mapping.largest_overlap(qname, fragment.interval)
            except KeyError:
                filename_errors.append(fragment.name)
                continue
            except ValueError:
                interval_errors.append(fragment)
                # BUG FIX: `continue` was missing here, so a fragment
                # with no overlap fell through and dereferenced an
                # unbound (or stale) `finterval` below
                continue
            fstart, fend = finterval
            if qstart != fstart or qend != fend:
                # clip to the overlap and re-annotate the new span
                newstart = max(qstart, fstart)
                newend = min(qend, fend)
                newinterval = Interval(newstart, newend)
                newmark = corpus.annotation(qname, newinterval)
                fragment = FragmentToken(qname, newinterval, newmark)
            fragments.append(fragment)
        disc[class_id] = tuple(fragments)
    return ClassDict(disc), filename_errors, interval_errors
def load_alignment(fname, strip_tags=True):
    """Load a .ctm alignment file into lists of FragmentTokens.

    Each output list holds the consecutive tokens of one utterance
    (grouped on the first .ctm column). Times are rounded to 10 ms.

    Parameters
    ----------
    fname : string
        Path to the .ctm file; columns are
        ``name channel start duration mark``.
    strip_tags : bool
        For phone files (filename contains "phone"), keep only the part
        of the mark before the first underscore.

    Returns
    -------
    list of list of FragmentToken
        One sorted list per utterance.
    """
    is_phone_file = "phone" in fname
    fragment_lists = []
    fragments = []
    previous_name = ""
    # BUG FIX: `for line in open(fname)` leaked the file handle;
    # use a context manager so it is always closed
    with open(fname) as fp:
        for line in fp:
            name, _, start, duration, mark = line.strip().split(' ')
            if name != previous_name:
                # utterance boundary: flush the current token list
                if fragments != []:
                    fragment_lists.append(fragments)
                fragments = []
                previous_name = name
            start = round(float(start), 2)
            stop = start + round(float(duration), 2)
            interval = Interval(start, stop)
            if is_phone_file and strip_tags:
                mark = mark.split('_')[0]
            fragments.append(FragmentToken(name, interval, mark))
    if fragments != []:
        fragment_lists.append(fragments)
    # Phone and word alignments aren't necessarily in the same order, so sort.
    fragment_lists.sort()
    return fragment_lists
def tokens_at_interval(self, interval):
    """
    Get the annotation tokens corresponding to an interval.

    Parameters
    ----------
    interval : Interval

    Returns
    -------
    tuple of FragmentTokens
        FragmentTokens covered by the interval.
    """
    # nothing stored -> nothing covered
    if len(self.tokens) == 0:
        return tuple()
    # probe token carries the query interval at this annotation's name
    probe = FragmentToken(self.tokens[0].name, interval, None)
    try:
        lo = self.tokens.index_ge(probe)
    except ValueError:
        # no token at or after the query: empty result
        return tuple()
    try:
        hi = self.tokens.index_gt(probe)
    except ValueError:
        # query extends past the last token: take everything from lo on
        hi = len(self.tokens)
    return tuple(self.tokens[lo:hi])
def tokens(self, name, interval):
    """
    Find the FragmentTokens covering an interval.

    Results are memoized in ``self._cache`` keyed on ``(name, interval)``.

    Parameters
    ----------
    name : string
        Identifier.
    interval : Interval
        Time segment.

    Returns
    -------
    list of tokens
        FragmentTokens covered by the interval.

    Raises
    ------
    KeyError
        If `name` is not in the corpus.
    ValueError
        If no segment annotation overlaps `interval`.
    """
    key = (name, interval)
    if not key in self._cache:
        try:
            fa_for_filename = self[name]
        except KeyError:
            raise KeyError('no such name: {0}'.format(name))
        # probe token, used only for its (name, interval) sort position
        dummy_token = FragmentToken(name, interval, None)
        try:
            # rightmost segment annotation starting at or before `interval`
            fa = fa_for_filename.find_le(dummy_token)
        except ValueError:
            raise ValueError('interval not found: {0}'.format(str(interval)))
        if (fa.interval.overlap(interval)) > 0:
            self._cache[key] = fa.tokens_at_interval(interval)
        else:
            # the preceding annotation does not actually reach the query
            raise ValueError('interval not found: {0}'.format(str(interval)))
    return self._cache[key]
def load_annot(fname):
    """Load a 3-column ``start stop mark`` annotation file.

    The token name is the file's base name without extension; times are
    rounded to 10 ms.

    Parameters
    ----------
    fname : string
        Path to the annotation file.

    Returns
    -------
    list of FragmentToken
    """
    fs = []
    bname = path.splitext(path.basename(fname))[0]
    # BUG FIX: `for line in open(fname)` leaked the file handle
    with open(fname) as fp:
        for line in fp:
            start, stop, mark = line.strip().split(' ')
            interval = Interval(round(float(start), 2),
                                round(float(stop), 2))
            fs.append(FragmentToken(bname, interval, mark))
    return fs
class TestFragmentType(object):
    """Tests for the FragmentType container."""

    tokens = [
        FragmentToken('a', Interval(0.0, 0.1), 'a'),
        FragmentToken('a', Interval(0.1, 0.2), 'r'),
        FragmentToken('a', Interval(0.2, 0.3), 'm'),
        FragmentToken('a', Interval(0.3, 0.4), 's'),
        FragmentToken('a', Interval(0.4, 0.5), 'a')
    ]

    def test_mark(self):
        # tokens and mark are stored as given
        ft = FragmentType(self.tokens, 'markymark')
        assert ft.tokens == self.tokens
        assert ft.mark == 'markymark'

    def test_no_mark(self):
        # a None mark is preserved, not replaced
        ft = FragmentType(self.tokens, None)
        assert ft.tokens == self.tokens
        assert ft.mark is None
class TestTokenCmp(object):
    """Tests for token_cmp ordering of FragmentTokens."""

    f1 = FragmentToken('a', Interval(0.0, 0.5), None)
    f2 = FragmentToken('a', Interval(0.5, 1.5), None)
    f3 = FragmentToken('a', Interval(1.3, 1.4), None)
    f4 = FragmentToken('b', Interval(0, 1), None)

    def test_invalid_comparison(self):
        # tokens from different files cannot be ordered
        with pytest.raises(ValueError):
            token_cmp(self.f1, self.f4)

    def test_token_eq(self):
        # every token compares equal to itself
        for tok in (self.f1, self.f2, self.f3):
            assert token_cmp(tok, tok) == 0

    def test_token_cmp(self):
        # strictly earlier interval sorts first; overlap counts as a tie
        for early, late in ((self.f1, self.f2), (self.f1, self.f3)):
            assert token_cmp(early, late) == -1
            assert token_cmp(late, early) == 1
        assert token_cmp(self.f2, self.f3) == 0
        assert token_cmp(self.f3, self.f2) == 0
def load_match_file(match_fn, phn_corpus):
    """Load a match file into a shuffled, capped list of Match objects.

    The file interleaves 2-field header lines (the pair of base names)
    with 6-field match lines; each match line is interpreted against
    the most recent header line. Assumes a header line always precedes
    the first match line — otherwise `base1`/`base2` are unbound.

    Parameters
    ----------
    match_fn : string
        Path to the match file.
    phn_corpus : Corpus
        Phone corpus used to annotate the matched fragments.

    Returns
    -------
    list of Match
        At most 100000 matches, in random order.
    """
    with open(match_fn) as f:
        matches = []
        for line in f:
            fields = line.strip().split()
            if len(fields) == 2:
                # header line: remember the file pair for the matches below
                base1, base2 = fields
            elif len(fields) == 6:
                dtw = float(fields[4])
                # NOTE(review): times are divided by 100 — presumably
                # centisecond frames to seconds; confirm against the
                # match-file producer
                start1, end1, start2, end2 = map(
                    lambda x: float(x) / 100.0, fields[:4])
                interval1 = Interval(start1, end1)
                interval2 = Interval(start2, end2)
                fragment1 = FragmentToken(
                    base1, interval1, phn_corpus.annotation(base1, interval1))
                fragment2 = FragmentToken(
                    base2, interval2, phn_corpus.annotation(base2, interval2))
                matches.append(Match(fragment1, fragment2, dtw))
        # shuffle before truncating so the cap is a random sample
        random.shuffle(matches)
        return matches[:100000]
def extract_single(tokens1, tokens2, minlength, maxlength, same):
    """Extract gold alignments between two phone lists.

    Parameters
    ----------
    tokens1, tokens2 : list of FragmentTokens
    minlength : int
        Minimum number of symbols in a fragment
    maxlength : int
        Maximum number of symbols in a fragment
    same : boolean
        Whether `tokens1` and `tokens2` are identical.

    Returns
    -------
    l : list of (FragmentToken, FragmentToken)
        List of token pairs containing the cooccurring fragments
    """
    ids1, intervals1, phones1 = zip(*tokens1)
    ids2, intervals2, phones2 = zip(*tokens2)
    id1 = ids1[0]  # ids are all the same
    id2 = ids2[0]
    css = allcommonsubstrings(phones1, phones2,
                              minlength=minlength, maxlength=maxlength,
                              same=same)
    if css is None:
        # no common substring long enough
        return []
    r = []
    for slice1, slice2 in css:
        # a slice of phones maps back to the time span from the start
        # of its first covered interval to the end of its last
        r.append((FragmentToken(id1,
                                Interval(intervals1[slice1.start].start,
                                         intervals1[slice1.stop - 1].end),
                                phones1[slice1]),
                  FragmentToken(id2,
                                Interval(intervals2[slice2.start].start,
                                         intervals2[slice2.stop - 1].end),
                                phones2[slice2])))
    return r
class TestCheckIntervals(object):
    """Tests for check_intervals: (interval_errors, filename_errors)."""

    m1 = IntervalDB({'a': [(0.0, 1.0), (2.0, 3.0)]})
    # d1 lies inside the database; d2 and d4 overhang; d3 uses an
    # unknown filename
    d1 = ClassDict(
        {ClassID(0, 'm1'): (FragmentToken('a', Interval(0.0, 1.0), 'm1'),)})
    d2 = ClassDict(
        {ClassID(0, 'm1'): (FragmentToken('a', Interval(0.5, 1.5), 'm1'),)})
    d3 = ClassDict(
        {ClassID(0, 'm1'): (FragmentToken('b', Interval(0.0, 1.0), 'm1'),)})
    d4 = ClassDict(
        {ClassID(0, 'm1'): (FragmentToken('a', Interval(0.5, 2.5), 'm1'),)})

    def test_good_interval(self):
        # fully covered: no errors of either kind
        assert check_intervals(self.d1, self.m1) == ([], [])

    def test_interval_errors(self):
        # overhanging fragments come back as interval errors
        overhang1 = FragmentToken('a', Interval(0.5, 1.5), 'm1')
        assert check_intervals(self.d2, self.m1) == ([overhang1], [])
        overhang2 = FragmentToken('a', Interval(0.5, 2.5), 'm1')
        assert check_intervals(self.d4, self.m1) == ([overhang2], [])

    def test_bad_filename(self):
        # the unknown filename 'b' is reported in the second list
        assert check_intervals(self.d3, self.m1) == ([], ['b'])
def read_classfile(contents):
    """Read in class file.

    A class file consists of blocks: a header line ``Class <ID> [mark]``
    followed by interval lines ``<name> <start> <end>``, terminated by a
    whitespace line (or by the end of input).

    Parameters
    ----------
    contents : string

    Returns
    -------
    r : dict from ClassID to list of FragmentToken

    Raises
    ------
    ValueError
        If a new class header appears before the current class was
        closed by a whitespace line.
    """
    classp = re.compile(r"^Class (?P<classID>\d+)(?: (?P<mark>.+))?$")
    r = {}
    curr = []  # list of FragmentTokens without mark
    curr_class = None
    for lineno, line in enumerate(contents.split('\n')):
        m = re.match(classp, line)
        if m:  # on a line with a class label
            if curr_class is None:
                curr_class = ClassID(int(m.group('classID')),
                                     m.group('mark'))
            else:
                raise ValueError('new class while reading class')
        else:  # on an interval line or a whitespace line
            if len(line.strip()) > 0:
                # interval line: name, start, end
                split = line.strip().split(' ')
                name = split[0]
                start = float(split[1])
                end = float(split[2])
                interval = Interval(start, end)
                curr.append(FragmentToken(name, interval, None))
            else:  # whitespace line closes the current class
                if curr_class is None:
                    # stray blank line outside a class is tolerated
                    continue
                r[curr_class] = tuple(curr)
                curr = []
                curr_class = None
    # flush a class left open at end of input
    if not curr_class is None:
        r[curr_class] = tuple(curr)
    return r
def read_annotation(contents):
    """Read a 4-column annotation string into token lists.

    Each line is ``<ID> <start> <stop> <mark>``. Consecutive lines with
    the same ID whose intervals are exactly adjacent are grouped into
    one token list; a temporal gap or an ID change starts a new list.

    Parameters
    ----------
    contents : string

    Returns
    -------
    r : list of list of FragmentToken

    Raises
    ------
    ReadError
        On malformed lines, unparsable floats or invalid intervals.
    """
    ID_prev = None
    interval_prev = None
    r = []
    tokenlist_curr = []
    for line_idx, line in enumerate(contents.split('\n')):
        if line == '':
            continue
        try:
            ID_curr, start, stop, mark = line.strip().split(' ')
        except ValueError:
            raise ReadError('badly formatted line {1}: {0}'.format(
                line, line_idx))
        try:
            start = float(start)
            stop = float(stop)
        except ValueError:
            raise ReadError(
                'could not convert string to float in line {1}: {0}'.format(
                    line, line_idx))
        try:
            interval_curr = Interval(start, stop)
        except ValueError:
            raise ReadError(
                'invalid interval in line {0}: ({1:.3f} {2:.3f})'.format(
                    line_idx, start, stop))
        token = FragmentToken(ID_curr, interval_curr, mark)
        if ID_prev is None:
            # very first token starts the first list
            tokenlist_curr = [token]
            ID_prev = ID_curr
        elif ID_prev == ID_curr:
            if interval_prev.is_left_adjacent_to(interval_curr):
                tokenlist_curr.append(token)
            else:
                # gap within the same file: start a new segment
                r.append(tokenlist_curr)
                tokenlist_curr = [token]
        else:  # ID_prev != ID_curr
            r.append(tokenlist_curr)
            tokenlist_curr = [token]
            ID_prev = ID_curr
        interval_prev = interval_curr
    # NOTE(review): on empty input this appends the empty list,
    # returning [[]] rather than [] — confirm callers expect that
    r.append(tokenlist_curr)
    return r
class TestPairwiseSubstringCompletion(object):
    """Tests for pairwise_substring_completion on a toy 3-file corpus.

    Files 'a' and 'b' both carry the marks a-e, file 'c' carries f-j,
    each as five contiguous 0.25 s tokens.
    """

    fragments = [FragmentToken('a', Interval(0.0, 0.25), 'a'),
                 FragmentToken('a', Interval(0.25, 0.5), 'b'),
                 FragmentToken('a', Interval(0.5, 0.75), 'c'),
                 FragmentToken('a', Interval(0.75, 1.0), 'd'),
                 FragmentToken('a', Interval(1.0, 1.25), 'e'),
                 FragmentToken('b', Interval(0.0, 0.25), 'a'),
                 FragmentToken('b', Interval(0.25, 0.5), 'b'),
                 FragmentToken('b', Interval(0.5, 0.75), 'c'),
                 FragmentToken('b', Interval(0.75, 1.0), 'd'),
                 FragmentToken('b', Interval(1.0, 1.25), 'e'),
                 FragmentToken('c', Interval(0.0, 0.25), 'f'),
                 FragmentToken('c', Interval(0.25, 0.5), 'g'),
                 FragmentToken('c', Interval(0.5, 0.75), 'h'),
                 FragmentToken('c', Interval(0.75, 1.0), 'i'),
                 FragmentToken('c', Interval(1.0, 1.25), 'j')]
    sa = [SegmentAnnotation('a', fragments[:5]),
          SegmentAnnotation('b', fragments[5:10]),
          SegmentAnnotation('c', fragments[10:])]
    ca = Corpus(sa)
    # query fragments; fragment4 covers all five tokens of file 'b'
    fragment1 = FragmentToken('a', Interval(0.0, 1.0), None)
    fragment2 = FragmentToken('b', Interval(0.0, 1.0), None)
    fragment3 = FragmentToken('c', Interval(0.0, 1.0), None)
    fragment4 = FragmentToken('b', Interval(0.0, 1.25), None)
    # completed fragments (with mark sequences) used in the expectations
    pfragments = [FragmentToken('a', Interval(0.0, 1.0), ('a', 'b', 'c', 'd')),
                  FragmentToken('a', Interval(0.25, 1.25), ('b', 'c', 'd', 'e')),
                  FragmentToken('a', Interval(0.0, 0.75), ('a', 'b', 'c')),
                  FragmentToken('a', Interval(0.25, 1.0), ('b', 'c', 'd')),
                  FragmentToken('a', Interval(0.5, 1.25), ('c', 'd', 'e')),
                  FragmentToken('b', Interval(0.0, 1.0), ('a', 'b', 'c', 'd')),
                  FragmentToken('b', Interval(0.25, 1.25), ('b', 'c', 'd', 'e')),
                  FragmentToken('b', Interval(0.0, 0.75), ('a', 'b', 'c')),
                  FragmentToken('b', Interval(0.25, 1.0), ('b', 'c', 'd')),
                  FragmentToken('b', Interval(0.5, 1.25), ('c', 'd', 'e')),
                  FragmentToken('c', Interval(0.0, 1.0), ('f', 'g', 'h', 'i')),
                  FragmentToken('c', Interval(0.25, 1.25), ('g', 'h', 'i', 'j')),
                  FragmentToken('c', Interval(0.0, 0.75), ('f', 'g', 'h')),
                  FragmentToken('c', Interval(0.25, 1.0), ('g', 'h', 'i')),
                  FragmentToken('c', Interval(0.5, 1.25), ('h', 'i', 'j'))]

    def test_same(self):
        # fragment1 - fragment2
        # abcd - abcd
        # expected:
        # abcd - abcd
        # abc - abc
        # bcd - bcd
        e = set([(self.pfragments[0], self.pfragments[5]),
                 (self.pfragments[2], self.pfragments[7]),
                 (self.pfragments[3], self.pfragments[8])])
        p = set(pairwise_substring_completion(self.fragment1, self.fragment2,
                                              self.ca, 3, 20))
        assert (p == e)

    def test_different(self):
        # fragment1 - fragment3
        # abcd - fghi
        # expected:
        # abcd - fghi
        # abc - fgh
        # bcd - ghi
        e = set([(self.pfragments[0], self.pfragments[10]),
                 (self.pfragments[2], self.pfragments[12]),
                 (self.pfragments[3], self.pfragments[13])])
        p = set(pairwise_substring_completion(self.fragment1, self.fragment3,
                                              self.ca, 3, 20))
        assert (e == p)

    def test_longer(self):
        # fragment1 - fragment4
        # abcd - abcde
        # expected:
        # abcd - abcd
        # abc - abc
        # bcd - bcd
        # abcd - bcde
        # abc - bcd
        # bcd - cde
        e = set([(self.pfragments[0], self.pfragments[5]),
                 (self.pfragments[2], self.pfragments[7]),
                 (self.pfragments[3], self.pfragments[8]),
                 (self.pfragments[0], self.pfragments[6]),
                 (self.pfragments[2], self.pfragments[8]),
                 (self.pfragments[3], self.pfragments[9])])
        p = set(pairwise_substring_completion(self.fragment1, self.fragment4,
                                              self.ca, 3, 20))
        assert (e == p)

    def test_different_and_longer(self):
        # fragment3 - fragment4
        # fghi - abcde
        # expected:
        # fghi - abcd
        # fgh - abc
        # ghi - bcd
        # fghi - bcde
        # fgh - bcd
        # ghi - cde
        e = set([(self.pfragments[10], self.pfragments[5]),
                 (self.pfragments[12], self.pfragments[7]),
                 (self.pfragments[13], self.pfragments[8]),
                 (self.pfragments[10], self.pfragments[6]),
                 (self.pfragments[12], self.pfragments[8]),
                 (self.pfragments[13], self.pfragments[9])])
        p = set(pairwise_substring_completion(self.fragment3, self.fragment4,
                                              self.ca, 3, 20))
        assert (e == p)
class TestSegmentAnnotation(object):
    """Tests for SegmentAnnotation: construction, equality and lookup."""

    # five contiguous tokens on file 'a' spelling out 'armsa'
    tokenlist = (FragmentToken('a', Interval(0.0, 0.1), 'a'),
                 FragmentToken('a', Interval(0.1, 0.2), 'r'),
                 FragmentToken('a', Interval(0.2, 0.3), 'm'),
                 FragmentToken('a', Interval(0.3, 0.4), 's'),
                 FragmentToken('a', Interval(0.4, 0.5), 'a'))
    sa = SegmentAnnotation('name1', tokenlist)

    def test_restrict(self):
        # restricting to a database that covers everything is a no-op;
        # a partial database keeps only the covered tokens
        db1 = IntervalDB({'a': [Interval(0, 0.5)]})
        db2 = IntervalDB({'a': [Interval(0, 0.3)]})
        assert (self.sa.restrict(db1) == self.sa)
        assert (self.sa.restrict(db2) == SegmentAnnotation(
            'name1', self.tokenlist[:3]))

    def test_len(self):
        assert (len(self.sa) == 5)

    def test_iter(self):
        assert (list(iter(self.sa)) == list(self.tokenlist))

    def test_get_item(self):
        # indexing mirrors the underlying token list
        for i in xrange(len(self.tokenlist)):
            assert (self.sa[i] == self.tokenlist[i])

    def test_eq(self):
        assert (self.sa == self.sa)

    def test_eq_wrong_name(self):
        sa1 = SegmentAnnotation('name1', [])
        sa2 = SegmentAnnotation('name2', [])
        assert (sa1 != sa2)

    def test_eq_wrong_interval(self):
        sa1 = SegmentAnnotation('name1',
                                [FragmentToken('', Interval(0, 1), None)])
        sa2 = SegmentAnnotation('name1',
                                [FragmentToken('', Interval(0, 3), None)])
        assert (sa1 != sa2)

    def test_eq_wrong_ntokens(self):
        # same span, different number of tokens -> not equal
        sa1 = SegmentAnnotation('name1',
                                [FragmentToken('', Interval(0, 2), None)])
        sa2 = SegmentAnnotation('name1', [
            FragmentToken('', Interval(0, 1), None),
            FragmentToken('', Interval(1, 2), None)
        ])
        assert (sa1 != sa2)

    def test_tokens_at_interval(self):
        assert (self.sa.tokens_at_interval(Interval(0.0, 0.5)) == tuple(
            self.tokenlist))
        assert (self.sa.tokens_at_interval(Interval(0.1, 0.4)) == tuple(
            self.tokenlist[1:4]))
        # partial overlap with a single token yields that token
        assert (self.sa.tokens_at_interval(Interval(0.0, 0.05)) ==
                (self.tokenlist[0], ))
        # intervals outside the annotation yield nothing
        assert (self.sa.tokens_at_interval(Interval(10, 11)) == tuple())
        assert (SegmentAnnotation('', []).tokens_at_interval(Interval(
            0, 1)) == tuple())

    def test_annotation_at_interval(self):
        assert (self.sa.annotation_at_interval(Interval(0.0, 0.5)) == tuple(
            ['a', 'r', 'm', 's', 'a']))
        assert (self.sa.annotation_at_interval(Interval(0.1, 0.4)) == tuple(
            ['r', 'm', 's']))
        assert (self.sa.annotation_at_interval(Interval(0.0, 0.05)) ==
                tuple(['a']))
        assert (self.sa.annotation_at_interval(Interval(10, 11)) == tuple())

    def test_empty(self):
        # an empty annotation has no interval
        e = SegmentAnnotation('', [])
        assert (e.name == '')
        assert (e.interval is None)

    def test_non_contiguous(self):
        # tokens with a temporal gap are rejected
        with pytest.raises(ValueError):
            SegmentAnnotation('', [
                FragmentToken('a', Interval(0, 1), None),
                FragmentToken('a', Interval(2, 3), None)
            ])

    def test_different_names(self):
        # all tokens must share one filename
        with pytest.raises(ValueError):
            SegmentAnnotation('', [
                FragmentToken('a', Interval(0, 1), None),
                FragmentToken('b', Interval(1, 2), None)
            ])
def test_non_contiguous(self):
    # tokens with a temporal gap cannot form a SegmentAnnotation
    gapped = [FragmentToken('a', Interval(0, 1), None),
              FragmentToken('a', Interval(2, 3), None)]
    with pytest.raises(ValueError):
        SegmentAnnotation('', gapped)
def test_different_names(self):
    # all tokens in a SegmentAnnotation must share one filename
    mixed = [FragmentToken('a', Interval(0, 1), None),
             FragmentToken('b', Interval(1, 2), None)]
    with pytest.raises(ValueError):
        SegmentAnnotation('', mixed)
class TestReadClasses(object):
    """End-to-end tests for class-file and annotation reading.

    The fixture strings follow the formats expected by read_classfile
    (blank-line-terminated ``Class N`` blocks) and read_annotation
    (one ``name start stop mark`` record per line).
    """

    tiny_classes = """Class 0
f1 0.000 4.000
f2 0.000 4.000

Class 1
f1 1.000 4.000
f2 1.000 4.000

Class 2
f1 0.000 3.000
f2 0.000 3.000
"""
    tiny_corpus = """f1 0.000 1.000 a
f1 1.000 2.000 b
f1 2.000 3.000 c
f1 3.000 4.000 d
f2 0.000 1.000 a
f2 1.000 2.000 b
f2 2.000 3.000 c
f2 3.000 4.000 d
"""
    # expected classes straight from the class file: no marks yet
    clsdict_e = {
        ClassID(0, None): (FragmentToken('f1', Interval(0.0, 4.0), None),
                           FragmentToken('f2', Interval(0.0, 4.0), None)),
        ClassID(1, None): (FragmentToken('f1', Interval(1.0, 4.0), None),
                           FragmentToken('f2', Interval(1.0, 4.0), None)),
        ClassID(2, None): (FragmentToken('f1', Interval(0.0, 3.0), None),
                           FragmentToken('f2', Interval(0.0, 3.0), None))
    }
    # expected classes after annotation from the corpus
    clsdict_a = {
        ClassID(0, None):
        (FragmentToken('f1', Interval(0.0, 4.0), ('a', 'b', 'c', 'd')),
         FragmentToken('f2', Interval(0.0, 4.0), ('a', 'b', 'c', 'd'))),
        ClassID(1, None):
        (FragmentToken('f1', Interval(1.0, 4.0), ('b', 'c', 'd')),
         FragmentToken('f2', Interval(1.0, 4.0), ('b', 'c', 'd'))),
        ClassID(2, None):
        (FragmentToken('f1', Interval(0.0, 3.0), ('a', 'b', 'c')),
         FragmentToken('f2', Interval(0.0, 3.0), ('a', 'b', 'c')))
    }
    tokens = [
        FragmentToken('f1', Interval(0.0, 1.0), 'a'),
        FragmentToken('f1', Interval(1.0, 2.0), 'b'),
        FragmentToken('f1', Interval(2.0, 3.0), 'c'),
        FragmentToken('f1', Interval(3.0, 4.0), 'd'),
        FragmentToken('f2', Interval(0.0, 1.0), 'a'),
        FragmentToken('f2', Interval(1.0, 2.0), 'b'),
        FragmentToken('f2', Interval(2.0, 3.0), 'c'),
        FragmentToken('f2', Interval(3.0, 4.0), 'd')
    ]
    corpus = Corpus([
        SegmentAnnotation('f1', tokens[:4]),
        SegmentAnnotation('f2', tokens[4:])
    ])

    def test_small(self):
        assert (self.clsdict_e == read_classfile(self.tiny_classes))

    def test_corpus(self):
        assert (self.corpus == tokenlists_to_corpus(
            read_annotation(self.tiny_corpus)))

    def test_annotate(self):
        assert (self.clsdict_a == annotate_classes(
            read_classfile(self.tiny_classes),
            tokenlists_to_corpus(read_annotation(self.tiny_corpus))))
def test_read_small(self):
    # f1 has a gap between 0.500 and 0.700, so its tokens split into
    # two segment annotations; f2 forms a third
    contents = """f1 0.000 0.100 a
f1 0.100 0.200 r
f1 0.200 0.300 m
f1 0.300 0.400 s
f1 0.400 0.500 a
f1 0.700 0.800 w
f1 0.800 0.900 o
f1 0.900 1.000 r
f1 1.000 1.100 m
f1 1.100 1.200 s
f1 1.200 1.300 a
f2 0.100 0.200 w
f2 0.200 0.300 o
f2 0.300 0.400 r
f2 0.400 0.500 d
f2 0.500 0.600 s
"""
    tokens = [
        FragmentToken('f1', Interval(0.0, 0.1), 'a'),
        FragmentToken('f1', Interval(0.1, 0.2), 'r'),
        FragmentToken('f1', Interval(0.2, 0.3), 'm'),
        FragmentToken('f1', Interval(0.3, 0.4), 's'),
        FragmentToken('f1', Interval(0.4, 0.5), 'a'),
        FragmentToken('f1', Interval(0.7, 0.8), 'w'),
        FragmentToken('f1', Interval(0.8, 0.9), 'o'),
        FragmentToken('f1', Interval(0.9, 1.0), 'r'),
        FragmentToken('f1', Interval(1.0, 1.1), 'm'),
        FragmentToken('f1', Interval(1.1, 1.2), 's'),
        FragmentToken('f1', Interval(1.2, 1.3), 'a'),
        FragmentToken('f2', Interval(0.1, 0.2), 'w'),
        FragmentToken('f2', Interval(0.2, 0.3), 'o'),
        FragmentToken('f2', Interval(0.3, 0.4), 'r'),
        FragmentToken('f2', Interval(0.4, 0.5), 'd'),
        FragmentToken('f2', Interval(0.5, 0.6), 's')
    ]
    corpus = Corpus([
        SegmentAnnotation('f1', tokens[0:5]),
        SegmentAnnotation('f1', tokens[5:11]),
        SegmentAnnotation('f2', tokens[11:])
    ])
    assert ([tokens[0:5], tokens[5:11],
             tokens[11:]] == read_annotation(contents))
    assert (tokenlists_to_corpus(read_annotation(contents)) == corpus)
class TestClassDict(object):
    """Tests for ClassDict: restriction, fragment and pair iteration."""

    tokens = [
        FragmentToken('a', Interval(0, 1), 'm1'),
        FragmentToken('b', Interval(2, 3), 'm1'),
        FragmentToken('c', Interval(2, 3), 'm1'),
        FragmentToken('b', Interval(0, 1), 'm2'),
        FragmentToken('c', Interval(0, 1), 'm2')
    ]
    id0 = ClassID(0, 'c1')
    id1 = ClassID(1, 'c2')
    # d1: one class with two fragments; d2: singleton class; d3: empty
    # class; d4: two classes with two fragments each
    d1 = {id0: (tokens[0], tokens[1])}
    d2 = {id0: (tokens[0], )}
    d3 = {id0: tuple()}
    d4 = {id0: (tokens[0], tokens[2]), id1: (tokens[3], tokens[4])}
    c1 = ClassDict(d1)
    c2 = ClassDict(d2)
    c3 = ClassDict(d3)
    c4 = ClassDict(d4)

    def test_restrict(self):
        # db1 covers every fragment: restriction is a no-op (except for
        # singleton removal and empty classes)
        db1 = IntervalDB({
            'a': [Interval(0, 1)],
            'b': [Interval(0, 3)],
            'c': [Interval(0, 3)]
        })
        assert (self.c1.restrict(db1) == self.c1)
        assert (self.c2.restrict(db1) == self.c2)
        assert (self.c2.restrict(db1, remove_singletons=True) == ClassDict({}))
        assert (self.c3.restrict(db1) == ClassDict({}))
        assert (self.c4.restrict(db1) == self.c4)
        # db2 drops filename 'b': fragments on 'b' disappear
        db2 = IntervalDB({'a': [Interval(0, 1)], 'c': [Interval(0, 3)]})
        assert (self.c1.restrict(db2) == self.c2)
        assert (self.c2.restrict(db2) == self.c2)
        assert (self.c2.restrict(db2, remove_singletons=True) == ClassDict({}))
        assert (self.c3.restrict(db2) == ClassDict({}))
        assert (self.c4.restrict(db2) == ClassDict({
            self.id0: (self.tokens[0], self.tokens[2]),
            self.id1: (self.tokens[4], )
        }))
        assert (self.c4.restrict(db2, remove_singletons=True) == ClassDict(
            {self.id0: (self.tokens[0], self.tokens[2])}))

    def test_iter_fragments(self):
        assert (list(
            self.c1.iter_fragments()) == [self.tokens[0], self.tokens[1]])
        assert (list(self.c2.iter_fragments()) == [self.tokens[0]])
        assert (list(self.c3.iter_fragments()) == [])
        assert (list(self.c4.iter_fragments()) == [
            self.tokens[0], self.tokens[2], self.tokens[3], self.tokens[4]
        ])

    def test_iter_fragments_with_class(self):
        # with_class=True pairs each fragment with its ClassID
        assert (list(self.c1.iter_fragments(with_class=True)) == [
            (self.id0, self.tokens[0]), (self.id0, self.tokens[1])
        ])
        assert (list(self.c2.iter_fragments(with_class=True)) == [
            (self.id0, self.tokens[0])
        ])
        assert (list(self.c3.iter_fragments(with_class=True)) == [])
        assert (list(self.c4.iter_fragments(with_class=True)) == [
            (self.id0, self.tokens[0]), (self.id0, self.tokens[2]),
            (self.id1, self.tokens[3]), (self.id1, self.tokens[4])
        ])

    def test_iter_pairs_across_set(self):
        # across classes, unordered pairs
        within = False
        order = False
        assert (list(self.c1.iter_pairs(within, order)) == [(self.tokens[0],
                                                             self.tokens[1])])
        assert (list(self.c2.iter_pairs(within, order)) == [])
        assert (list(self.c3.iter_pairs(within, order)) == [])
        assert (set(self.c4.iter_pairs(within, order)) == set([
            (self.tokens[0], self.tokens[2]),
            (self.tokens[0], self.tokens[3]),
            (self.tokens[0], self.tokens[4]),
            (self.tokens[2], self.tokens[3]),
            (self.tokens[2], self.tokens[4]),
            (self.tokens[3], self.tokens[4])
        ]))

    def test_iter_pairs_across_order(self):
        # across classes, both orderings of every pair
        within = False
        order = True
        assert (set(self.c1.iter_pairs(within, order)) == set([
            (self.tokens[0], self.tokens[1]),
            (self.tokens[1], self.tokens[0])
        ]))
        assert (list(self.c2.iter_pairs(within, order)) == [])
        assert (list(self.c3.iter_pairs(within, order)) == [])
        assert (set(self.c4.iter_pairs(within, order)) == set([
            (self.tokens[0], self.tokens[2]),
            (self.tokens[2], self.tokens[0]),
            (self.tokens[0], self.tokens[3]),
            (self.tokens[3], self.tokens[0]),
            (self.tokens[0], self.tokens[4]),
            (self.tokens[4], self.tokens[0]),
            (self.tokens[2], self.tokens[3]),
            (self.tokens[3], self.tokens[2]),
            (self.tokens[2], self.tokens[4]),
            (self.tokens[4], self.tokens[2]),
            (self.tokens[3], self.tokens[4]),
            (self.tokens[4], self.tokens[3])
        ]))

    def test_iter_pairs_within_set(self):
        # within each class, unordered pairs
        within = True
        order = False
        assert (set(self.c1.iter_pairs(within, order)) == set([
            (self.tokens[0], self.tokens[1])
        ]))
        assert (list(self.c2.iter_pairs(within, order)) == [])
        assert (list(self.c3.iter_pairs(within, order)) == [])
        assert (set(self.c4.iter_pairs(within, order)) == set([
            (self.tokens[0], self.tokens[2]),
            (self.tokens[3], self.tokens[4])
        ]))

    def test_iter_pairs_within_order(self):
        # within each class, both orderings
        within = True
        order = True
        assert (set(self.c1.iter_pairs(within, order)) == set([
            (self.tokens[0], self.tokens[1]),
            (self.tokens[1], self.tokens[0])
        ]))
        assert (list(self.c2.iter_pairs(within, order)) == [])
        assert (list(self.c3.iter_pairs(within, order)) == [])
        assert (set(self.c4.iter_pairs(within, order)) == set([
            (self.tokens[0], self.tokens[2]),
            (self.tokens[2], self.tokens[0]),
            (self.tokens[3], self.tokens[4]),
            (self.tokens[4], self.tokens[3])
        ]))
def test_mark(self):
    # name, interval and mark are stored exactly as given
    ft = FragmentToken('name', Interval(0, 1), 'markymark')
    assert ft.name == 'name'
    assert ft.interval == Interval(0, 1)
    assert ft.mark == 'markymark'
def test_no_mark(self):
    # a None mark is preserved, not replaced by a default
    ft = FragmentToken('name', Interval(0, 1), None)
    assert ft.name == 'name'
    assert ft.interval == Interval(0, 1)
    assert ft.mark is None