Beispiel #1
0
class TestSegmentAnnotationCmp(object):
    """Exercise ``annotation_cmp`` on single-token annotations."""

    # sa1..sa3 share the name 'n1'; sa4 is named 'n2'.
    sa1 = SegmentAnnotation(
        'n1', [FragmentToken('n1', Interval(0, 0.5), None)])
    sa2 = SegmentAnnotation(
        'n1', [FragmentToken('n1', Interval(0.5, 1.5), None)])
    sa3 = SegmentAnnotation(
        'n1', [FragmentToken('n1', Interval(1.3, 1.4), None)])
    sa4 = SegmentAnnotation(
        'n2', [FragmentToken('n2', Interval(0, 1), None)])

    def test_invalid_comparison(self):
        """Comparing sa1 with the differently named sa4 raises ValueError."""
        with pytest.raises(ValueError):
            annotation_cmp(self.sa1, self.sa4)

    def test_annotation_eq(self):
        """Each annotation compares as 0 against itself."""
        for annotation in (self.sa1, self.sa2, self.sa3):
            assert annotation_cmp(annotation, annotation) == 0

    def test_annotation_cmp(self):
        """Check the -1 / 1 / 0 outcomes among the three 'n1' annotations."""
        expectations = (
            (self.sa1, self.sa2, -1),
            (self.sa1, self.sa3, -1),
            (self.sa2, self.sa1, 1),
            (self.sa3, self.sa1, 1),
            # sa3's interval lies inside sa2's; the pair is expected to
            # compare as 0 in both directions.
            (self.sa2, self.sa3, 0),
            (self.sa3, self.sa2, 0),
        )
        for left, right, outcome in expectations:
            assert annotation_cmp(left, right) == outcome
 def test_interval_errors(self):
     """d2 and d4 are each reported as a single interval error."""
     cases = ((self.d2, (0.5, 1.5)), (self.d4, (0.5, 2.5)))
     for clsdict, (start, end) in cases:
         outcome = check_intervals(clsdict, self.m1)
         offending = FragmentToken('a', Interval(start, end), 'm1')
         assert outcome == ([offending], [])
Beispiel #3
0
 def test_eq_wrong_ntokens(self):
     """Same name and overall span, but a different token count: unequal."""
     single = SegmentAnnotation(
         'name1', [FragmentToken('', Interval(0, 2), None)])
     halves = [
         FragmentToken('', Interval(0, 1), None),
         FragmentToken('', Interval(1, 2), None),
     ]
     double = SegmentAnnotation('name1', halves)
     assert single != double
 def test_truncate_interval(self):
     """d2 and d4 are both clipped to (0.5, 1.0) and re-marked ('c', 'd')."""
     for clsdict in (self.d2, self.d4):
         outcome = truncate_intervals(clsdict, self.ca, self.m1)
         clipped = FragmentToken('a', Interval(0.5, 1.0), ('c', 'd'))
         wanted = ClassDict({ClassID(0, 'm1'): (clipped, )})
         assert outcome == (wanted, [], [])
Beispiel #5
0
def test_typeset():
    """Check ``typeset`` on disjoint, empty and identical-mark pairs.

    Uses ``range`` rather than ``xrange`` so the test runs unchanged on
    both Python 2 and Python 3.
    """
    # Left marks 'm0'..'m9', right marks 'n10'..'n19': all twenty expected.
    pairs = [(FragmentToken(None, Interval(0, 1), 'm{0}'.format(n1)),
              FragmentToken(None, Interval(0, 1), 'n{0}'.format(n2)))
             for n1, n2 in zip(range(10), range(10, 20))]
    observed = set(typeset(pairs))
    expected = set(['m{0}'.format(n) for n in range(10)] +
                   ['n{0}'.format(n) for n in range(10, 20)])
    assert observed == expected

    # No pairs at all: nothing is produced.
    pairs = []
    assert set(typeset(pairs)) == set()

    # Both members of each pair carry the same mark: ten distinct marks.
    pairs = [(FragmentToken(None, Interval(0, 1), 'm{0}'.format(n)),
              FragmentToken(None, Interval(0, 1), 'm{0}'.format(n)))
             for n in range(10)]
    assert set(typeset(pairs)) == set('m{0}'.format(n) for n in range(10))
class TestCheckTruncateIntervals(object):
    """Tests for ``truncate_intervals`` against a one-file corpus."""

    # Interval database holding two intervals for file 'a'.
    m1 = IntervalDB({'a': [(0.0, 1.0), (2.0, 3.0)]})

    # Single-class dictionaries; each holds one fragment marked 'm1'.
    d1 = ClassDict({
        ClassID(0, 'm1'): (FragmentToken('a', Interval(0.0, 1.0), 'm1'), ),
    })
    d2 = ClassDict({
        ClassID(0, 'm1'): (FragmentToken('a', Interval(0.5, 1.5), 'm1'), ),
    })
    d3 = ClassDict({
        ClassID(0, 'm1'): (FragmentToken('b', Interval(0.0, 1.0), 'm1'), ),
    })
    d4 = ClassDict({
        ClassID(0, 'm1'): (FragmentToken('a', Interval(0.5, 2.5), 'm1'), ),
    })

    # Annotation of file 'a': four 0.25-long tokens marked 'a'..'d'.
    sa = [
        SegmentAnnotation('a', [
            FragmentToken('a', Interval(0.0, 0.25), 'a'),
            FragmentToken('a', Interval(0.25, 0.5), 'b'),
            FragmentToken('a', Interval(0.5, 0.75), 'c'),
            FragmentToken('a', Interval(0.75, 1.0), 'd'),
        ])
    ]
    ca = Corpus(sa)

    def test_good_interval(self):
        """d1 comes back unchanged with no errors."""
        outcome = truncate_intervals(self.d1, self.ca, self.m1)
        assert outcome == (self.d1, [], [])

    def test_truncate_interval(self):
        """d2 and d4 are clipped to (0.5, 1.0) and re-marked ('c', 'd')."""
        for clsdict in (self.d2, self.d4):
            outcome = truncate_intervals(clsdict, self.ca, self.m1)
            clipped = FragmentToken('a', Interval(0.5, 1.0), ('c', 'd'))
            wanted = ClassDict({ClassID(0, 'm1'): (clipped, )})
            assert outcome == (wanted, [], [])
Beispiel #7
0
def pairwise_substring_completion(fragment1, fragment2, corpus, minlength,
                                  maxlength):
    """Yield FragmentToken pairs for the sub-sequences of two fragments.

    The corpus tokens under each fragment are reduced to
    ``(mark, interval)`` tuples and handed to ``psubstrings`` together with
    `minlength` and `maxlength`.  For every pair of sub-sequences it
    produces, a pair of FragmentTokens is yielded; each spans from the start
    of its first interval to the end of its last one and carries the tuple
    of marks as its mark.
    """
    name1, name2 = fragment1.name, fragment2.name

    marked1 = []
    for token in corpus.tokens(name1, fragment1.interval):
        marked1.append((token.mark, token.interval))
    marked2 = []
    for token in corpus.tokens(name2, fragment2.interval):
        marked2.append((token.mark, token.interval))

    for sub1, sub2 in psubstrings(marked1, marked2, minlength, maxlength):
        # Split each run of (mark, interval) tuples into parallel tuples.
        marks1, spans1 = zip(*sub1)
        marks2, spans2 = zip(*sub2)
        cover1 = Interval(spans1[0].start, spans1[-1].end)
        cover2 = Interval(spans2[0].start, spans2[-1].end)
        left = FragmentToken(name1, cover1, marks1)
        right = FragmentToken(name2, cover2, marks2)
        yield (left, right)
Beispiel #8
0
def test_freqs():
    """Check ``freqs`` on disjoint, empty and identical-mark pairs.

    The expected dictionary is built with ``dict.update`` and ``range``:
    concatenating ``dict.items()`` results and ``xrange`` only work on
    Python 2.
    """
    # Twenty distinct marks, each expected with a count of 1.
    pairs = [(FragmentToken(None, Interval(0, 1), 'm{0}'.format(n1)),
              FragmentToken(None, Interval(0, 1), 'n{0}'.format(n2)))
             for n1, n2 in zip(range(10), range(10, 20))]
    observed = freqs(pairs)
    expected = {'m{0}'.format(n): 1 for n in range(10)}
    expected.update({'n{0}'.format(n): 1 for n in range(10, 20)})
    assert observed == expected

    # No pairs: empty result.
    pairs = []
    assert freqs(pairs) == dict()

    # Both members of a pair share one mark; the expected count is still 1.
    pairs = [(FragmentToken(None, Interval(0, 1), 'm{0}'.format(n)),
              FragmentToken(None, Interval(0, 1), 'm{0}'.format(n)))
             for n in range(10)]
    assert freqs(pairs) == {'m{0}'.format(n): 1 for n in range(10)}
Beispiel #9
0
def annotate_classes(clsdict, corpus, split=None):
    """Attach corpus annotations to every token of a class dictionary.

    Parameters
    ----------
    clsdict : mapping from class id to iterable of FragmentToken
    corpus : object providing ``annotation(name, interval)``
    split : object providing ``is_covered`` and ``largest_overlap``, optional
        When given, every token it does not cover is recorded in the error
        list and clipped to the largest overlapping interval before being
        annotated.  Tokens for which no overlap can be found are dropped.

    Returns
    -------
    (ClassDict, list of FragmentToken)
        The annotated classes (classes left without tokens are omitted) and
        the tokens that were not covered by `split`.

    """
    annotated = {}
    errors = []
    check_split = split is not None
    # Iterate keys and index the mapping (as truncate_intervals does) so
    # that both plain dicts and ClassDicts work on Python 2 and Python 3.
    for classID in clsdict:
        newtokens = []
        for token in clsdict[classID]:
            filename = token.name
            interval = token.interval
            if check_split and not split.is_covered(filename, interval):
                # Recorded even when the token is successfully clipped below.
                errors.append(token)
                try:
                    finterval = split.largest_overlap(filename, interval)
                    qstart, qend = interval
                    fstart, fend = finterval
                    # Clip only when the query differs from the overlap at
                    # either end (the end-point check used to compare
                    # fstart with qend).
                    if fstart != qstart or fend != qend:
                        interval = Interval(max(fstart, qstart),
                                            min(fend, qend))
                except (KeyError, ValueError):
                    continue
            try:
                annot = tuple(corpus.annotation(filename, interval))
            except Exception:
                # Best effort: tokens that cannot be annotated are skipped,
                # but KeyboardInterrupt / SystemExit are no longer swallowed.
                continue
            newtokens.append(FragmentToken(filename, interval, annot))
        if newtokens:
            annotated[classID] = tuple(newtokens)
    return ClassDict(annotated), errors
Beispiel #10
0
def truncate_intervals(clsdict, corpus, mapping):
    """Clip every fragment to its largest overlap in an interval mapping.

    Parameters
    ----------
    clsdict : mapping from class id to iterable of FragmentToken
    corpus : object providing ``annotation(name, interval)``
        Used to re-annotate fragments whose interval was clipped.
    mapping : object providing ``largest_overlap(name, interval)``

    Returns
    -------
    (ClassDict, list, list)
        The clipped classes, the names for which ``largest_overlap`` raised
        KeyError, and the fragments for which it raised ValueError.
        Fragments in either error list are left out of the result.

    """
    disc = {}
    interval_errors = []
    filename_errors = []
    for class_id in clsdict:
        fragments = []
        for fragment in clsdict[class_id]:
            qname = fragment.name
            qstart = fragment.interval.start
            qend = fragment.interval.end
            try:
                finterval = mapping.largest_overlap(qname, fragment.interval)
            except KeyError:
                filename_errors.append(qname)
                continue
            except ValueError:
                interval_errors.append(fragment)
                # Without this, execution fell through with `finterval`
                # unbound (or left over from a previous fragment).
                continue
            fstart, fend = finterval
            if qstart != fstart or qend != fend:
                newinterval = Interval(max(qstart, fstart), min(qend, fend))
                newmark = corpus.annotation(qname, newinterval)
                fragment = FragmentToken(qname, newinterval, newmark)
            fragments.append(fragment)
        disc[class_id] = tuple(fragments)
    return ClassDict(disc), filename_errors, interval_errors
Beispiel #11
0
def load_alignment(fname, strip_tags=True):
    """Load a .ctm alignment file into lists of FragmentTokens.

    Every non-blank line must consist of five single-space-separated
    fields: name, an ignored field, start, duration and mark.  Consecutive
    lines sharing a name are collected into one list.

    Parameters
    ----------
    fname : string
        Path of the alignment file.
    strip_tags : bool
        When true and `fname` contains "phone", everything from the first
        underscore onwards is removed from each mark.

    Returns
    -------
    list of list of FragmentToken
        One list per run of equal names, sorted.

    """
    fragment_lists = []
    fragments = []
    previous_name = ""
    strip_marks = strip_tags and "phone" in fname

    # The context manager closes the file even if a line fails to parse.
    with open(fname) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of failing on the unpacking.
                continue
            name, _, start, duration, mark = line.split(' ')
            if name != previous_name:
                if fragments:
                    fragment_lists.append(fragments)
                fragments = []
            previous_name = name
            start = round(float(start), 2)
            stop = start + round(float(duration), 2)
            interval = Interval(start, stop)

            if strip_marks:
                mark = mark.split('_')[0]

            fragments.append(FragmentToken(name, interval, mark))
    if fragments:
        fragment_lists.append(fragments)

    # Phone and word alignments aren't necessarily in the same order, so sort.
    fragment_lists.sort()
    return fragment_lists
Beispiel #12
0
    def tokens_at_interval(self, interval):
        """
        Collect the annotation tokens that correspond to an interval.

        Parameters
        ----------
        interval : Interval

        Returns
        -------
        tuple of FragmentTokens
            The slice of ``self.tokens`` between the positions reported by
            ``index_ge`` and ``index_gt`` for a probe token built from
            `interval`; empty when there are no tokens or ``index_ge``
            raises ValueError.
        """
        if len(self.tokens) == 0:
            return tuple()
        # The probe borrows the name of the first stored token.
        probe = FragmentToken(self.tokens[0].name, interval, None)
        try:
            first = self.tokens.index_ge(probe)
        except ValueError:
            return tuple()
        try:
            past_last = self.tokens.index_gt(probe)
        except ValueError:
            # No later token: take everything up to the end.
            past_last = len(self.tokens)
        return tuple(self.tokens[first:past_last])
Beispiel #13
0
    def tokens(self, name, interval):
        """
        Look up the FragmentTokens covering an interval, with caching.

        Parameters
        ----------
        name : string
            Identifier.
        interval : Interval
            Time segment.

        Returns
        -------
        list of tokens
            FragmentTokens covered by the interval.

        Raises
        ------
        KeyError
            If `name` is not a key of this object.
        ValueError
            If no annotation is found for `interval`, or the one found does
            not overlap it.
        """
        cache_key = (name, interval)
        if cache_key in self._cache:
            return self._cache[cache_key]

        try:
            candidates = self[name]
        except KeyError:
            raise KeyError('no such name: {0}'.format(name))

        probe = FragmentToken(name, interval, None)
        try:
            annotation = candidates.find_le(probe)
        except ValueError:
            raise ValueError('interval not found: {0}'.format(str(interval)))

        if not (annotation.interval.overlap(interval) > 0):
            raise ValueError('interval not found: {0}'.format(str(interval)))

        self._cache[cache_key] = annotation.tokens_at_interval(interval)
        return self._cache[cache_key]
def load_annot(fname):
    """Load an annotation file into a list of FragmentTokens.

    Every non-blank line must consist of three single-space-separated
    fields: start, stop and mark.  Start and stop are rounded to two
    decimals.  Each token is named after the file's base name without its
    extension.

    Parameters
    ----------
    fname : string
        Path of the annotation file.

    Returns
    -------
    list of FragmentToken
        One token per non-blank line, in file order.

    """
    fs = []
    bname = path.splitext(path.basename(fname))[0]
    # The context manager closes the file even if a line fails to parse.
    with open(fname) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of failing on the unpacking.
                continue
            start, stop, mark = line.split(' ')
            interval = Interval(round(float(start), 2), round(float(stop), 2))
            fs.append(FragmentToken(bname, interval, mark))
    return fs
Beispiel #15
0
class TestFragmentType(object):
    """Check that ``FragmentType`` exposes the tokens and mark it was given."""

    tokens = [
        FragmentToken('a', Interval(0.0, 0.1), 'a'),
        FragmentToken('a', Interval(0.1, 0.2), 'r'),
        FragmentToken('a', Interval(0.2, 0.3), 'm'),
        FragmentToken('a', Interval(0.3, 0.4), 's'),
        FragmentToken('a', Interval(0.4, 0.5), 'a'),
    ]

    def _build(self, mark):
        """Create a FragmentType over the fixture and verify its tokens."""
        fragment_type = FragmentType(self.tokens, mark)
        assert fragment_type.tokens == self.tokens
        return fragment_type

    def test_mark(self):
        """A string mark is stored as given."""
        assert self._build('markymark').mark == 'markymark'

    def test_no_mark(self):
        """A missing mark stays None."""
        assert self._build(None).mark is None
Beispiel #16
0
class TestTokenCmp(object):
    """Exercise ``token_cmp`` on four fixture tokens."""

    # f1..f3 belong to file 'a'; f4 belongs to file 'b'.
    f1 = FragmentToken('a', Interval(0.0, 0.5), None)
    f2 = FragmentToken('a', Interval(0.5, 1.5), None)
    f3 = FragmentToken('a', Interval(1.3, 1.4), None)
    f4 = FragmentToken('b', Interval(0, 1), None)

    def test_invalid_comparison(self):
        """Comparing f1 with the differently named f4 raises ValueError."""
        with pytest.raises(ValueError):
            token_cmp(self.f1, self.f4)

    def test_token_eq(self):
        """Each token compares as 0 against itself."""
        for token in (self.f1, self.f2, self.f3):
            assert token_cmp(token, token) == 0

    def test_token_cmp(self):
        """Check the -1 / 1 / 0 outcomes among the three 'a' tokens."""
        expectations = (
            (self.f1, self.f2, -1),
            (self.f1, self.f3, -1),
            (self.f2, self.f1, 1),
            (self.f3, self.f1, 1),
            # f3's interval lies inside f2's; the pair is expected to
            # compare as 0 in both directions.
            (self.f2, self.f3, 0),
            (self.f3, self.f2, 0),
        )
        for left, right, outcome in expectations:
            assert token_cmp(left, right) == outcome
Beispiel #17
0
def load_match_file(match_fn, phn_corpus, max_matches=100000):
    """Read a match file and return a shuffled, capped list of Matches.

    Lines are split on whitespace.  A two-field line sets the pair of base
    names used for the match lines that follow.  A six-field line describes
    one match: the first four fields, each divided by 100, are the start
    and end of the two intervals, and the fifth is the DTW score (the sixth
    is not used).  Lines with any other number of fields are ignored.

    Parameters
    ----------
    match_fn : string
        Path of the match file.
    phn_corpus : object providing ``annotation(name, interval)``
        Supplies the mark of each fragment.
    max_matches : int or None
        Upper bound on the number of matches returned after shuffling;
        None returns all of them.

    Returns
    -------
    list of Match

    Raises
    ------
    ValueError
        If a match line occurs before any two-field header line.

    """
    matches = []
    base1 = base2 = None
    with open(match_fn) as f:
        for line in f:
            fields = line.strip().split()
            if len(fields) == 2:
                base1, base2 = fields
            elif len(fields) == 6:
                if base1 is None:
                    # Previously this surfaced as an UnboundLocalError.
                    raise ValueError(
                        'match line before any header line: {0}'.format(
                            line.strip()))
                dtw = float(fields[4])
                start1, end1, start2, end2 = [
                    float(x) / 100.0 for x in fields[:4]]
                interval1 = Interval(start1, end1)
                interval2 = Interval(start2, end2)
                fragment1 = FragmentToken(
                    base1, interval1, phn_corpus.annotation(base1, interval1))
                fragment2 = FragmentToken(
                    base2, interval2, phn_corpus.annotation(base2, interval2))
                matches.append(Match(fragment1, fragment2, dtw))

    random.shuffle(matches)
    return matches[:max_matches]
Beispiel #18
0
def extract_single(tokens1, tokens2, minlength, maxlength, same):
    """Extract gold alignments between two phone lists.

    Parameters
    ----------
    tokens1, tokens2 : list of FragmentTokens
        Each token unpacks into (id, interval, phone); only the first id of
        each list is used for the fragments built from it.
    minlength : int
        Minimum number of symbols in a fragment
    maxlength : int
        Passed to ``allcommonsubstrings`` together with `minlength`.
    same : boolean
        Whether `tokens1` and `tokens2` are identical.

    Returns
    -------
    l : list of (FragmentToken, FragmentToken)
        List of token pairs containing the cooccurring fragments; empty if
        either token list is empty or no common substrings are reported.

    """
    # An empty list has nothing in common with anything; returning early
    # also avoids the unpacking error zip(*[]) would trigger below.
    if not tokens1 or not tokens2:
        return []

    ids1, intervals1, phones1 = zip(*tokens1)
    ids2, intervals2, phones2 = zip(*tokens2)
    id1 = ids1[0]  # ids are all the same
    id2 = ids2[0]
    css = allcommonsubstrings(phones1, phones2,
                              minlength=minlength, maxlength=maxlength,
                              same=same)
    if css is None:
        return []

    r = []
    for slice1, slice2 in css:
        # Each fragment runs from the start of its first phone to the end of
        # its last one.
        span1 = Interval(intervals1[slice1.start].start,
                         intervals1[slice1.stop - 1].end)
        fragment1 = FragmentToken(id1, span1, phones1[slice1])
        span2 = Interval(intervals2[slice2.start].start,
                         intervals2[slice2.stop - 1].end)
        fragment2 = FragmentToken(id2, span2, phones2[slice2])
        r.append((fragment1, fragment2))
    return r
Beispiel #19
0
class TestCheckIntervals(object):
    """Tests for ``check_intervals``.

    Judging by the expectations below, the first returned list holds the
    offending fragments and the second the offending file names.
    """

    # Interval database holding two intervals for file 'a'.
    m1 = IntervalDB({'a': [(0.0, 1.0), (2.0, 3.0)]})

    # Single-class dictionaries; each holds one fragment marked 'm1'.
    d1 = ClassDict({
        ClassID(0, 'm1'): (FragmentToken('a', Interval(0.0, 1.0), 'm1'), ),
    })
    d2 = ClassDict({
        ClassID(0, 'm1'): (FragmentToken('a', Interval(0.5, 1.5), 'm1'), ),
    })
    d3 = ClassDict({
        ClassID(0, 'm1'): (FragmentToken('b', Interval(0.0, 1.0), 'm1'), ),
    })
    d4 = ClassDict({
        ClassID(0, 'm1'): (FragmentToken('a', Interval(0.5, 2.5), 'm1'), ),
    })

    def test_good_interval(self):
        """d1 produces no errors of either kind."""
        outcome = check_intervals(self.d1, self.m1)
        assert outcome == ([], [])

    def test_interval_errors(self):
        """d2 and d4 are each reported as a single interval error."""
        cases = ((self.d2, (0.5, 1.5)), (self.d4, (0.5, 2.5)))
        for clsdict, (start, end) in cases:
            outcome = check_intervals(clsdict, self.m1)
            offending = FragmentToken('a', Interval(start, end), 'm1')
            assert outcome == ([offending], [])

    def test_bad_filename(self):
        """d3 refers to file 'b', which is reported by name."""
        outcome = check_intervals(self.d3, self.m1)
        assert outcome == ([], ['b'])
Beispiel #20
0
def read_classfile(contents):
    """Parse the text of a class file.

    A line of the form ``Class <number>`` (optionally followed by a space
    and a mark) opens a class, each following non-blank line contributes
    one fragment (name, start, end), and a blank line closes the class.

    Parameters
    ----------
    contents : string

    Returns
    -------
    r : dict from ClassID to tuple of FragmentToken
        The fragments carry no mark.

    Raises
    ------
    ValueError
        If a class header is met while another class is still open.

    """
    header = re.compile(r"^Class (?P<classID>\d+)(?: (?P<mark>.+))?$")
    classes = {}
    pending = []  # fragments collected since the last reset
    open_class = None

    for line in contents.split('\n'):
        found = header.match(line)
        if found:
            if open_class is not None:
                raise ValueError('new class while reading class')
            open_class = ClassID(int(found.group('classID')),
                                 found.group('mark'))
            continue

        stripped = line.strip()
        if len(stripped) > 0:
            # Interval line: name, start, end separated by single spaces.
            fields = stripped.split(' ')
            name = fields[0]
            start = float(fields[1])
            end = float(fields[2])
            pending.append(FragmentToken(name, Interval(start, end), None))
            continue

        # Blank line: close the open class, if there is one.
        if open_class is None:
            continue
        classes[open_class] = tuple(pending)
        pending = []
        open_class = None

    # The last class need not be followed by a blank line.
    if open_class is not None:
        classes[open_class] = tuple(pending)
    return classes
Beispiel #21
0
def read_annotation(contents):
    """Parse annotation text into lists of contiguous FragmentTokens.

    Every non-blank line must consist of four single-space-separated
    fields: ID, start, stop and mark.  A new list is started whenever the
    ID changes or the previous interval is not left-adjacent to the current
    one.

    Parameters
    ----------
    contents : string

    Returns
    -------
    list of list of FragmentToken
        Empty when `contents` holds no annotation lines.

    Raises
    ------
    ReadError
        On a line with the wrong number of fields, non-numeric times, or an
        interval rejected by ``Interval``.

    """
    ID_prev = None
    interval_prev = None
    r = []
    tokenlist_curr = []
    for line_idx, line in enumerate(contents.split('\n')):
        stripped = line.strip()
        if not stripped:
            # Skip empty and whitespace-only lines (e.g. a lone '\r').
            continue
        try:
            ID_curr, start, stop, mark = stripped.split(' ')
        except ValueError:
            raise ReadError('badly formatted line {1}: {0}'.format(
                line, line_idx))
        try:
            start = float(start)
            stop = float(stop)
        except ValueError:
            raise ReadError(
                'could not convert string to float in line {1}: {0}'.format(
                    line, line_idx))
        try:
            interval_curr = Interval(start, stop)
        except ValueError:
            raise ReadError(
                'invalid interval in line {0}: ({1:.3f} {2:.3f})'.format(
                    line_idx, start, stop))

        token = FragmentToken(ID_curr, interval_curr, mark)

        if ID_prev is None:
            tokenlist_curr = [token]
            ID_prev = ID_curr
        elif ID_prev == ID_curr:
            if interval_prev.is_left_adjacent_to(interval_curr):
                tokenlist_curr.append(token)
            else:
                # Gap within the same ID: close the list and start another.
                r.append(tokenlist_curr)
                tokenlist_curr = [token]
        else:  # ID_prev != ID_curr
            r.append(tokenlist_curr)
            tokenlist_curr = [token]
            ID_prev = ID_curr
        interval_prev = interval_curr
    # Only flush a list that actually holds tokens, so that empty input
    # yields [] rather than [[]].
    if tokenlist_curr:
        r.append(tokenlist_curr)
    return r
Beispiel #22
0
class TestPairwiseSubstringCompletion(object):
    """Tests for ``pairwise_substring_completion`` on a three-file corpus.

    All tests call it with a minimum length of 3 and a maximum of 20.
    """

    # Five consecutive 0.25-long tokens for each of the files 'a', 'b', 'c'.
    fragments = [
        FragmentToken('a', Interval(0.0, 0.25), 'a'),
        FragmentToken('a', Interval(0.25, 0.5), 'b'),
        FragmentToken('a', Interval(0.5, 0.75), 'c'),
        FragmentToken('a', Interval(0.75, 1.0), 'd'),
        FragmentToken('a', Interval(1.0, 1.25), 'e'),

        FragmentToken('b', Interval(0.0, 0.25), 'a'),
        FragmentToken('b', Interval(0.25, 0.5), 'b'),
        FragmentToken('b', Interval(0.5, 0.75), 'c'),
        FragmentToken('b', Interval(0.75, 1.0), 'd'),
        FragmentToken('b', Interval(1.0, 1.25), 'e'),

        FragmentToken('c', Interval(0.0, 0.25), 'f'),
        FragmentToken('c', Interval(0.25, 0.5), 'g'),
        FragmentToken('c', Interval(0.5, 0.75), 'h'),
        FragmentToken('c', Interval(0.75, 1.0), 'i'),
        FragmentToken('c', Interval(1.0, 1.25), 'j'),
    ]
    sa = [
        SegmentAnnotation('a', fragments[:5]),
        SegmentAnnotation('b', fragments[5:10]),
        SegmentAnnotation('c', fragments[10:]),
    ]
    ca = Corpus(sa)

    # Query fragments: the first four tokens of 'a', 'b' and 'c', and all
    # five tokens of 'b'.
    fragment1 = FragmentToken('a', Interval(0.0, 1.0), None)
    fragment2 = FragmentToken('b', Interval(0.0, 1.0), None)
    fragment3 = FragmentToken('c', Interval(0.0, 1.0), None)
    fragment4 = FragmentToken('b', Interval(0.0, 1.25), None)

    # Expected sub-fragments, five per file; the tests refer to them by
    # index (0-4: 'a', 5-9: 'b', 10-14: 'c').
    pfragments = [
        FragmentToken('a', Interval(0.0, 1.0), ('a', 'b', 'c', 'd')),
        FragmentToken('a', Interval(0.25, 1.25), ('b', 'c', 'd', 'e')),
        FragmentToken('a', Interval(0.0, 0.75), ('a', 'b', 'c')),
        FragmentToken('a', Interval(0.25, 1.0), ('b', 'c', 'd')),
        FragmentToken('a', Interval(0.5, 1.25), ('c', 'd', 'e')),

        FragmentToken('b', Interval(0.0, 1.0), ('a', 'b', 'c', 'd')),
        FragmentToken('b', Interval(0.25, 1.25), ('b', 'c', 'd', 'e')),
        FragmentToken('b', Interval(0.0, 0.75), ('a', 'b', 'c')),
        FragmentToken('b', Interval(0.25, 1.0), ('b', 'c', 'd')),
        FragmentToken('b', Interval(0.5, 1.25), ('c', 'd', 'e')),

        FragmentToken('c', Interval(0.0, 1.0), ('f', 'g', 'h', 'i')),
        FragmentToken('c', Interval(0.25, 1.25), ('g', 'h', 'i', 'j')),
        FragmentToken('c', Interval(0.0, 0.75), ('f', 'g', 'h')),
        FragmentToken('c', Interval(0.25, 1.0), ('g', 'h', 'i')),
        FragmentToken('c', Interval(0.5, 1.25), ('h', 'i', 'j')),
    ]

    def _expected(self, *index_pairs):
        """Build the expected set from pairs of indices into pfragments."""
        return set([(self.pfragments[i], self.pfragments[j])
                    for i, j in index_pairs])

    def _completions(self, left, right):
        """Collect the pairs produced for two query fragments."""
        return set(pairwise_substring_completion(left, right, self.ca,
                                                 3, 20))

    def test_same(self):
        """fragment1 vs fragment2: abcd - abcd."""
        # expected:
        # abcd - abcd
        # abc - abc
        # bcd - bcd
        wanted = self._expected((0, 5), (2, 7), (3, 8))
        found = self._completions(self.fragment1, self.fragment2)
        assert found == wanted

    def test_different(self):
        """fragment1 vs fragment3: abcd - fghi."""
        # expected:
        # abcd - fghi
        # abc - fgh
        # bcd - ghi
        wanted = self._expected((0, 10), (2, 12), (3, 13))
        found = self._completions(self.fragment1, self.fragment3)
        assert wanted == found

    def test_longer(self):
        """fragment1 vs fragment4: abcd - abcde."""
        # expected:
        # abcd - abcd
        # abc - abc
        # bcd - bcd
        #
        # abcd - bcde
        # abc - bcd
        # bcd - cde
        wanted = self._expected((0, 5), (2, 7), (3, 8),
                                (0, 6), (2, 8), (3, 9))
        found = self._completions(self.fragment1, self.fragment4)
        assert wanted == found

    def test_different_and_longer(self):
        """fragment3 vs fragment4: fghi - abcde."""
        # expected:
        # fghi - abcd
        # fgh - abc
        # ghi - bcd
        #
        # fghi - bcde
        # fgh - bcd
        # ghi - cde
        wanted = self._expected((10, 5), (12, 7), (13, 8),
                                (10, 6), (12, 8), (13, 9))
        found = self._completions(self.fragment3, self.fragment4)
        assert wanted == found
Beispiel #23
0
class TestSegmentAnnotation(object):
    """Tests for the ``SegmentAnnotation`` container."""

    # Five consecutive 0.1-long tokens of file 'a', marked a-r-m-s-a.
    tokenlist = (
        FragmentToken('a', Interval(0.0, 0.1), 'a'),
        FragmentToken('a', Interval(0.1, 0.2), 'r'),
        FragmentToken('a', Interval(0.2, 0.3), 'm'),
        FragmentToken('a', Interval(0.3, 0.4), 's'),
        FragmentToken('a', Interval(0.4, 0.5), 'a'),
    )
    sa = SegmentAnnotation('name1', tokenlist)

    def test_restrict(self):
        """Restricting keeps only the tokens inside the database interval."""
        db1 = IntervalDB({'a': [Interval(0, 0.5)]})
        db2 = IntervalDB({'a': [Interval(0, 0.3)]})
        assert self.sa.restrict(db1) == self.sa
        assert self.sa.restrict(db2) == SegmentAnnotation(
            'name1', self.tokenlist[:3])

    def test_len(self):
        """The length is the number of tokens."""
        assert len(self.sa) == 5

    def test_iter(self):
        """Iteration yields the tokens in order."""
        assert list(iter(self.sa)) == list(self.tokenlist)

    def test_get_item(self):
        """Indexing returns the token at that position."""
        # enumerate replaces the xrange(len(...)) index loop, which is both
        # unidiomatic and unavailable on Python 3.
        for position, token in enumerate(self.tokenlist):
            assert self.sa[position] == token

    def test_eq(self):
        """An annotation equals itself."""
        assert self.sa == self.sa

    def test_eq_wrong_name(self):
        """Empty annotations with different names are unequal."""
        sa1 = SegmentAnnotation('name1', [])
        sa2 = SegmentAnnotation('name2', [])
        assert sa1 != sa2

    def test_eq_wrong_interval(self):
        """Same name but a different token interval: unequal."""
        sa1 = SegmentAnnotation('name1',
                                [FragmentToken('', Interval(0, 1), None)])
        sa2 = SegmentAnnotation('name1',
                                [FragmentToken('', Interval(0, 3), None)])
        assert sa1 != sa2

    def test_eq_wrong_ntokens(self):
        """Same name and overall span but a different token count: unequal."""
        sa1 = SegmentAnnotation('name1',
                                [FragmentToken('', Interval(0, 2), None)])
        sa2 = SegmentAnnotation('name1', [
            FragmentToken('', Interval(0, 1), None),
            FragmentToken('', Interval(1, 2), None)
        ])
        assert sa1 != sa2

    def test_tokens_at_interval(self):
        """Token lookup for full, inner, partial, outside and empty cases."""
        assert self.sa.tokens_at_interval(Interval(0.0, 0.5)) == tuple(
            self.tokenlist)
        assert self.sa.tokens_at_interval(Interval(0.1, 0.4)) == tuple(
            self.tokenlist[1:4])
        # An interval covering only part of the first token still selects it.
        assert self.sa.tokens_at_interval(Interval(0.0, 0.05)) == (
            self.tokenlist[0], )
        assert self.sa.tokens_at_interval(Interval(10, 11)) == tuple()
        assert SegmentAnnotation('', []).tokens_at_interval(
            Interval(0, 1)) == tuple()

    def test_annotation_at_interval(self):
        """Mark lookup mirrors the token lookup above."""
        assert self.sa.annotation_at_interval(Interval(0.0, 0.5)) == tuple(
            ['a', 'r', 'm', 's', 'a'])
        assert self.sa.annotation_at_interval(Interval(0.1, 0.4)) == tuple(
            ['r', 'm', 's'])
        assert self.sa.annotation_at_interval(Interval(0.0, 0.05)) == tuple(
            ['a'])
        assert self.sa.annotation_at_interval(Interval(10, 11)) == tuple()

    def test_empty(self):
        """An empty annotation keeps its name and has no interval."""
        e = SegmentAnnotation('', [])
        assert e.name == ''
        assert e.interval is None

    def test_non_contiguous(self):
        """Tokens separated by a gap are rejected with ValueError."""
        with pytest.raises(ValueError):
            SegmentAnnotation('', [
                FragmentToken('a', Interval(0, 1), None),
                FragmentToken('a', Interval(2, 3), None)
            ])

    def test_different_names(self):
        """Tokens carrying different names are rejected with ValueError."""
        with pytest.raises(ValueError):
            SegmentAnnotation('', [
                FragmentToken('a', Interval(0, 1), None),
                FragmentToken('b', Interval(1, 2), None)
            ])
Beispiel #24
0
 def test_non_contiguous(self):
     """Tokens separated by a gap are rejected with ValueError."""
     gapped = [
         FragmentToken('a', Interval(0, 1), None),
         FragmentToken('a', Interval(2, 3), None),
     ]
     with pytest.raises(ValueError):
         SegmentAnnotation('', gapped)
Beispiel #25
0
 def test_different_names(self):
     """Tokens carrying different names are rejected with ValueError."""
     mixed = [
         FragmentToken('a', Interval(0, 1), None),
         FragmentToken('b', Interval(1, 2), None),
     ]
     with pytest.raises(ValueError):
         SegmentAnnotation('', mixed)
Beispiel #26
0
class TestReadClasses(object):
    """Tests for reading class files and annotations and combining them."""

    # Class file text: three classes with two fragments each (files f1, f2).
    tiny_classes = """Class 0
f1 0.000 4.000
f2 0.000 4.000

Class 1
f1 1.000 4.000
f2 1.000 4.000

Class 2
f1 0.000 3.000
f2 0.000 3.000


"""
    # Annotation text: files f1 and f2, four 1.0-long tokens marked a-d each.
    tiny_corpus = """f1 0.000 1.000 a
f1 1.000 2.000 b
f1 2.000 3.000 c
f1 3.000 4.000 d
f2 0.000 1.000 a
f2 1.000 2.000 b
f2 2.000 3.000 c
f2 3.000 4.000 d

"""
    # Expected parse of tiny_classes: fragments without marks.
    clsdict_e = {
        ClassID(0, None):
        (FragmentToken('f1', Interval(0.0, 4.0),
                       None), FragmentToken('f2', Interval(0.0, 4.0), None)),
        ClassID(1, None): (FragmentToken('f1', Interval(1.0, 4.0), None),
                           FragmentToken('f2', Interval(1.0, 4.0), None)),
        ClassID(2, None): (FragmentToken('f1', Interval(0.0, 3.0), None),
                           FragmentToken('f2', Interval(0.0, 3.0), None))
    }
    # The same classes with each fragment marked by the tuple of corpus
    # marks lying under its interval.
    clsdict_a = {
        ClassID(0, None):
        (FragmentToken('f1', Interval(0.0, 4.0), ('a', 'b', 'c', 'd')),
         FragmentToken('f2', Interval(0.0, 4.0), ('a', 'b', 'c', 'd'))),
        ClassID(1, None):
        (FragmentToken('f1', Interval(1.0, 4.0), ('b', 'c', 'd')),
         FragmentToken('f2', Interval(1.0, 4.0), ('b', 'c', 'd'))),
        ClassID(2, None): (FragmentToken('f1', Interval(0.0, 3.0),
                                         ('a', 'b', 'c')),
                           FragmentToken('f2', Interval(0.0, 3.0),
                                         ('a', 'b', 'c')))
    }
    # Token-level equivalent of tiny_corpus.
    tokens = [
        FragmentToken('f1', Interval(0.0, 1.0), 'a'),
        FragmentToken('f1', Interval(1.0, 2.0), 'b'),
        FragmentToken('f1', Interval(2.0, 3.0), 'c'),
        FragmentToken('f1', Interval(3.0, 4.0), 'd'),
        FragmentToken('f2', Interval(0.0, 1.0), 'a'),
        FragmentToken('f2', Interval(1.0, 2.0), 'b'),
        FragmentToken('f2', Interval(2.0, 3.0), 'c'),
        FragmentToken('f2', Interval(3.0, 4.0), 'd')
    ]
    # Expected corpus: one annotation per file.
    corpus = Corpus([
        SegmentAnnotation('f1', tokens[:4]),
        SegmentAnnotation('f2', tokens[4:])
    ])

    def test_small(self):
        """Parsing tiny_classes yields clsdict_e."""
        assert (self.clsdict_e == read_classfile(self.tiny_classes))

    def test_corpus(self):
        """Parsing tiny_corpus and converting it yields the expected corpus."""
        assert (self.corpus == tokenlists_to_corpus(
            read_annotation(self.tiny_corpus)))

    def test_annotate(self):
        """Annotating the parsed classes with the parsed corpus."""
        # NOTE(review): the annotate_classes visible in this file returns a
        # (ClassDict, errors) pair, which would not compare equal to the
        # plain dict clsdict_a -- confirm which version this test targets.
        assert (self.clsdict_a == annotate_classes(
            read_classfile(self.tiny_classes),
            tokenlists_to_corpus(read_annotation(self.tiny_corpus))))
Beispiel #27
0
    def test_read_small(self):
        contents = """f1 0.000 0.100 a
f1 0.100 0.200 r
f1 0.200 0.300 m
f1 0.300 0.400 s
f1 0.400 0.500 a
f1 0.700 0.800 w
f1 0.800 0.900 o
f1 0.900 1.000 r
f1 1.000 1.100 m
f1 1.100 1.200 s
f1 1.200 1.300 a
f2 0.100 0.200 w
f2 0.200 0.300 o
f2 0.300 0.400 r
f2 0.400 0.500 d
f2 0.500 0.600 s
"""
        tokens = [
            FragmentToken('f1', Interval(0.0, 0.1), 'a'),
            FragmentToken('f1', Interval(0.1, 0.2), 'r'),
            FragmentToken('f1', Interval(0.2, 0.3), 'm'),
            FragmentToken('f1', Interval(0.3, 0.4), 's'),
            FragmentToken('f1', Interval(0.4, 0.5), 'a'),
            FragmentToken('f1', Interval(0.7, 0.8), 'w'),
            FragmentToken('f1', Interval(0.8, 0.9), 'o'),
            FragmentToken('f1', Interval(0.9, 1.0), 'r'),
            FragmentToken('f1', Interval(1.0, 1.1), 'm'),
            FragmentToken('f1', Interval(1.1, 1.2), 's'),
            FragmentToken('f1', Interval(1.2, 1.3), 'a'),
            FragmentToken('f2', Interval(0.1, 0.2), 'w'),
            FragmentToken('f2', Interval(0.2, 0.3), 'o'),
            FragmentToken('f2', Interval(0.3, 0.4), 'r'),
            FragmentToken('f2', Interval(0.4, 0.5), 'd'),
            FragmentToken('f2', Interval(0.5, 0.6), 's')
        ]
        corpus = Corpus([
            SegmentAnnotation('f1', tokens[0:5]),
            SegmentAnnotation('f1', tokens[5:11]),
            SegmentAnnotation('f2', tokens[11:])
        ])

        assert ([tokens[0:5], tokens[5:11],
                 tokens[11:]] == read_annotation(contents))
        assert (tokenlists_to_corpus(read_annotation(contents)) == corpus)
Beispiel #28
0
class TestClassDict(object):
    """Tests for ``ClassDict.restrict``, ``iter_fragments`` and
    ``iter_pairs`` on four small fixtures (two tokens in one class, a
    singleton class, an empty class, and two classes of two tokens each).
    """

    # Five fragments over files 'a', 'b' and 'c', marked 'm1' or 'm2'.
    tokens = [
        FragmentToken('a', Interval(0, 1), 'm1'),
        FragmentToken('b', Interval(2, 3), 'm1'),
        FragmentToken('c', Interval(2, 3), 'm1'),
        FragmentToken('b', Interval(0, 1), 'm2'),
        FragmentToken('c', Interval(0, 1), 'm2'),
    ]
    id0 = ClassID(0, 'c1')
    id1 = ClassID(1, 'c2')

    # Raw mappings behind the ClassDict fixtures below.
    d1 = {id0: (tokens[0], tokens[1])}
    d2 = {id0: (tokens[0], )}
    d3 = {id0: ()}
    d4 = {
        id0: (tokens[0], tokens[2]),
        id1: (tokens[3], tokens[4]),
    }
    c1, c2, c3, c4 = ClassDict(d1), ClassDict(d2), ClassDict(d3), ClassDict(d4)

    def test_restrict(self):
        """``restrict`` keeps, trims or empties classes depending on the
        interval database, optionally dropping singleton classes.
        """
        empty = ClassDict({})
        tok = self.tokens

        # Database with entries for all three files: c1, c2 and c4 are
        # expected to come back unchanged.
        full_db = IntervalDB({
            'a': [Interval(0, 1)],
            'b': [Interval(0, 3)],
            'c': [Interval(0, 3)],
        })
        assert self.c1.restrict(full_db) == self.c1
        assert self.c2.restrict(full_db) == self.c2
        assert self.c2.restrict(full_db, remove_singletons=True) == empty
        assert self.c3.restrict(full_db) == empty
        assert self.c4.restrict(full_db) == self.c4

        # Database without an entry for file 'b': the 'b' tokens are
        # expected to be dropped from the results.
        no_b_db = IntervalDB({'a': [Interval(0, 1)], 'c': [Interval(0, 3)]})
        assert self.c1.restrict(no_b_db) == self.c2
        assert self.c2.restrict(no_b_db) == self.c2
        assert self.c2.restrict(no_b_db, remove_singletons=True) == empty
        assert self.c3.restrict(no_b_db) == empty

        trimmed = ClassDict({
            self.id0: (tok[0], tok[2]),
            self.id1: (tok[4], ),
        })
        assert self.c4.restrict(no_b_db) == trimmed

        without_singletons = ClassDict({self.id0: (tok[0], tok[2])})
        assert (self.c4.restrict(no_b_db, remove_singletons=True)
                == without_singletons)

    def test_iter_fragments(self):
        """``iter_fragments`` yields the tokens of every class in turn."""
        t0, t1, t2, t3, t4 = self.tokens
        assert list(self.c1.iter_fragments()) == [t0, t1]
        assert list(self.c2.iter_fragments()) == [t0]
        assert list(self.c3.iter_fragments()) == []
        assert list(self.c4.iter_fragments()) == [t0, t2, t3, t4]

    def test_iter_fragments_with_class(self):
        """With ``with_class=True`` each token is paired with its class id."""
        t0, t1, t2, t3, t4 = self.tokens
        first, second = self.id0, self.id1
        assert list(self.c1.iter_fragments(with_class=True)) == [
            (first, t0), (first, t1),
        ]
        assert list(self.c2.iter_fragments(with_class=True)) == [(first, t0)]
        assert list(self.c3.iter_fragments(with_class=True)) == []
        assert list(self.c4.iter_fragments(with_class=True)) == [
            (first, t0), (first, t2),
            (second, t3), (second, t4),
        ]

    def test_iter_pairs_across_set(self):
        """``iter_pairs(False, False)``: each pair once, across classes."""
        within, order = False, False
        t0, t1, t2, t3, t4 = self.tokens
        assert list(self.c1.iter_pairs(within, order)) == [(t0, t1)]
        assert list(self.c2.iter_pairs(within, order)) == []
        assert list(self.c3.iter_pairs(within, order)) == []
        assert set(self.c4.iter_pairs(within, order)) == {
            (t0, t2), (t0, t3), (t0, t4),
            (t2, t3), (t2, t4),
            (t3, t4),
        }

    def test_iter_pairs_across_order(self):
        """``iter_pairs(False, True)``: both orientations, across classes."""
        within, order = False, True
        t0, t1, t2, t3, t4 = self.tokens
        assert set(self.c1.iter_pairs(within, order)) == {(t0, t1), (t1, t0)}
        assert list(self.c2.iter_pairs(within, order)) == []
        assert list(self.c3.iter_pairs(within, order)) == []
        assert set(self.c4.iter_pairs(within, order)) == {
            (t0, t2), (t2, t0),
            (t0, t3), (t3, t0),
            (t0, t4), (t4, t0),
            (t2, t3), (t3, t2),
            (t2, t4), (t4, t2),
            (t3, t4), (t4, t3),
        }

    def test_iter_pairs_within_set(self):
        """``iter_pairs(True, False)``: each pair once, same class only."""
        within, order = True, False
        t0, t1, t2, t3, t4 = self.tokens
        assert set(self.c1.iter_pairs(within, order)) == {(t0, t1)}
        assert list(self.c2.iter_pairs(within, order)) == []
        assert list(self.c3.iter_pairs(within, order)) == []
        assert set(self.c4.iter_pairs(within, order)) == {(t0, t2), (t3, t4)}

    def test_iter_pairs_within_order(self):
        """``iter_pairs(True, True)``: both orientations, same class only."""
        within, order = True, True
        t0, t1, t2, t3, t4 = self.tokens
        assert set(self.c1.iter_pairs(within, order)) == {(t0, t1), (t1, t0)}
        assert list(self.c2.iter_pairs(within, order)) == []
        assert list(self.c3.iter_pairs(within, order)) == []
        assert set(self.c4.iter_pairs(within, order)) == {
            (t0, t2), (t2, t0),
            (t3, t4), (t4, t3),
        }
Beispiel #29
0
 def test_mark(self):
     """A token built with a mark exposes its name, interval and mark."""
     token = FragmentToken('name', Interval(0, 1), 'markymark')
     assert token.name == 'name'
     assert token.interval == Interval(0, 1)
     assert token.mark == 'markymark'
Beispiel #30
0
 def test_no_mark(self):
     """A token built with ``None`` as mark keeps ``mark`` as ``None``."""
     token = FragmentToken('name', Interval(0, 1), None)
     assert token.name == 'name'
     assert token.interval == Interval(0, 1)
     assert token.mark is None