Example 1
class TestSegmentAnnotationCmp(object):
    sa1 = SegmentAnnotation('n1',
                            [FragmentToken('n1', Interval(0, 0.5), None)])
    sa2 = SegmentAnnotation('n1',
                            [FragmentToken('n1', Interval(0.5, 1.5), None)])
    sa3 = SegmentAnnotation('n1',
                            [FragmentToken('n1', Interval(1.3, 1.4), None)])
    sa4 = SegmentAnnotation('n2', [FragmentToken('n2', Interval(0, 1), None)])

    def test_invalid_comparison(self):
        with pytest.raises(ValueError):
            annotation_cmp(self.sa1, self.sa4)

    def test_annotation_eq(self):
        assert (annotation_cmp(self.sa1, self.sa1) == 0)
        assert (annotation_cmp(self.sa2, self.sa2) == 0)
        assert (annotation_cmp(self.sa3, self.sa3) == 0)

    def test_annotation_cmp(self):
        assert (annotation_cmp(self.sa1, self.sa2) == -1)
        assert (annotation_cmp(self.sa1, self.sa3) == -1)
        assert (annotation_cmp(self.sa2, self.sa1) == 1)
        assert (annotation_cmp(self.sa3, self.sa1) == 1)

        assert (annotation_cmp(self.sa2, self.sa3) == 0)
        assert (annotation_cmp(self.sa3, self.sa2) == 0)
Example 2
 def test_not_enough_overlap(self):
     i1 = Interval(0, 1)
     i2 = Interval(0.98, 2)
     assert (not i1.overlaps_with(i2))
     assert (not i2.overlaps_with(i1))
     assert (interval_cmp(i1, i2) == -1)
     assert (interval_cmp(i2, i1) == 1)
Example 3
 def test_badinterval(self):
     with pytest.raises(ValueError):
         Interval(1, 0)
     with pytest.raises(ValueError):
         Interval(-1, -0.5)
     with pytest.raises(ValueError):
         Interval(-2, -3)
Example 4
 def test_interval_errors(self):
     assert (check_intervals(self.d2, self.m1) == ([
         FragmentToken('a', Interval(0.5, 1.5), 'm1')
     ], []))
     assert (check_intervals(self.d4, self.m1) == ([
         FragmentToken('a', Interval(0.5, 2.5), 'm1')
     ], []))
Example 5
 def test_annotation_at_interval(self):
     assert (self.sa.annotation_at_interval(Interval(0.0, 0.5)) == tuple(
         ['a', 'r', 'm', 's', 'a']))
     assert (self.sa.annotation_at_interval(Interval(0.1, 0.4)) == tuple(
         ['r', 'm', 's']))
     assert (self.sa.annotation_at_interval(Interval(0.0,
                                                     0.05)) == tuple(['a']))
     assert (self.sa.annotation_at_interval(Interval(10, 11)) == tuple())
Example 6
 def test_eq_wrong_ntokens(self):
     sa1 = SegmentAnnotation('name1',
                             [FragmentToken('', Interval(0, 2), None)])
     sa2 = SegmentAnnotation('name1', [
         FragmentToken('', Interval(0, 1), None),
         FragmentToken('', Interval(1, 2), None)
     ])
     assert (sa1 != sa2)
Example 7
 def test_truncate_interval(self):
     assert (truncate_intervals(self.d2, self.ca, self.m1) == (ClassDict({
         ClassID(0, 'm1'): (FragmentToken('a', Interval(0.5, 1.0),
                                          ('c', 'd')), )
     }), [], []))
     assert (truncate_intervals(self.d4, self.ca, self.m1) == (ClassDict({
         ClassID(0, 'm1'): (FragmentToken('a', Interval(0.5, 1.0),
                                          ('c', 'd')), )
     }), [], []))
Example 8
 def test_tokens_at_interval(self):
     assert (self.sa.tokens_at_interval(Interval(0.0, 0.5)) == tuple(
         self.tokenlist))
     assert (self.sa.tokens_at_interval(Interval(0.1, 0.4)) == tuple(
         self.tokenlist[1:4]))
     assert (self.sa.tokens_at_interval(Interval(
         0.0, 0.05)) == (self.tokenlist[0], ))
     assert (self.sa.tokens_at_interval(Interval(10, 11)) == tuple())
     assert (SegmentAnnotation('', []).tokens_at_interval(Interval(
         0, 1)) == tuple())
Example 9
def load(phndir, wrddir, outdir, prefix):
    fragments = load_filesets(phndir, wrddir)
    phn_fragments, wrd_fragments = zip(*fragments)
    # debug: report fileset sizes and any files with empty phone lists
    print(len(phn_fragments), len(wrd_fragments))
    for i in range(len(phn_fragments)):
        if phn_fragments[i] == []:
            print(i)
    # Filtering out "SIL"/"sp" marks is disabled here: the script stops
    # working if SIL is removed.
    # phn_fragments = [[f for f in fl if f.mark not in ['SIL', 'sp']]
    #                  for fl in phn_fragments]
    # wrd_fragments = [[f for f in fl if f.mark not in ['SIL', 'sp']]
    #                  for fl in wrd_fragments]
    intervals_from_phn = {
        fl[0].name: Interval(fl[0].interval.start, fl[-1].interval.end)
        for fl in phn_fragments
    }
    intervals_from_wrd = {
        fl[0].name: Interval(fl[0].interval.start, fl[-1].interval.end)
        for fl in wrd_fragments
    }
    # check that the total file intervals match up
    assert (len(intervals_from_phn) == len(intervals_from_wrd))

    # check that each word corresponds to a sequence of phones exactly
    phn_corpus = tokenlists_to_corpus(phn_fragments)
    wrd_corpus = tokenlists_to_corpus(wrd_fragments)
    # (tokens_exact raises an exception if no exact match is found)
    for name, interval, mark in wrd_corpus.iter_fragments():
        phn_corpus.tokens_exact(name, interval)

    # write concatenated phn, wrd files
    with open(path.join(outdir, prefix + '.phn'), 'w') as fp:
        for fragment in sorted(chain.from_iterable(phn_fragments),
                               key=lambda x: (x.name, x.interval.start)):
            fp.write('{0} {1:.2f} {2:.2f} {3}\n'.format(
                fragment.name, fragment.interval.start, fragment.interval.end,
                fragment.mark))
    with open(path.join(outdir, prefix + '.wrd'), 'w') as fp:
        for fragment in sorted(chain.from_iterable(wrd_fragments),
                               key=lambda x: (x.name, x.interval.start)):
            fp.write('{0} {1:.2f} {2:.2f} {3}\n'.format(
                fragment.name, fragment.interval.start, fragment.interval.end,
                fragment.mark))
    with open(path.join(outdir, prefix + '.split'), 'w') as fp:
        for name, interval in sorted(intervals_from_phn.iteritems()):
            fp.write('{0} {1:.2f} {2:.2f}\n'.format(name, interval.start,
                                                    interval.end))

    return phn_fragments, wrd_fragments
Example 10
 def test_ca_intervals(self):
     exp_intervals = {
         'a': [Interval(0.0, 0.5), Interval(0.7, 1.3)],
         'b': [Interval(0.1, 0.6)]
     }
     pred_intervals = {}
     for fname in self.ca.keys():
         intervals = [
             fa.interval for fa in self.ca.segment_annotations[fname]
         ]
         pred_intervals[fname] = intervals
     assert (exp_intervals == pred_intervals)
Example 11
def load(phndir, wrddir, outdir):
    fragments = load_filesets(phndir, wrddir)
    phn_fragments, wrd_fragments = zip(*fragments)

    # remove "sil", "sp", "SIL"
    phn_fragments = [[f for f in fl if f.mark not in ['sil', 'sp', 'SIL']]
                     for fl in phn_fragments]
    wrd_fragments = [[f for f in fl if f.mark not in ['sil', 'sp', 'SIL']]
                     for fl in wrd_fragments]

    intervals_from_phn = {
        fl[0].name: Interval(fl[0].interval.start, fl[-1].interval.end)
        for fl in phn_fragments
    }
    intervals_from_wrd = {
        fl[0].name: Interval(fl[0].interval.start, fl[-1].interval.end)
        for fl in wrd_fragments
    }

    # check that the total file intervals match up
    assert (intervals_from_phn == intervals_from_wrd)

    # check that each word corresponds to a sequence of phones exactly
    wrd_corpus = tokenlists_to_corpus(wrd_fragments)
    phn_corpus = tokenlists_to_corpus(phn_fragments)

    # (tokens_exact raises an exception if no exact match is found)
    for name, interval, mark in wrd_corpus.iter_fragments():
        phn_corpus.tokens_exact(name, interval)

    # write concatenated phn, wrd files
    with open(path.join(outdir, '{}.phn'.format(CORPUS)), 'w') as fp:
        for fragment in sorted(chain.from_iterable(phn_fragments),
                               key=lambda x: (x.name, x.interval.start)):
            fp.write(u'{0} {1:.4f} {2:.4f} {3}\n'.format(
                fragment.name, fragment.interval.start, fragment.interval.end,
                fragment.mark))

    with open(path.join(outdir, '{}.wrd'.format(CORPUS)), 'w') as fp:
        for fragment in sorted(chain.from_iterable(wrd_fragments),
                               key=lambda x: (x.name, x.interval.start)):
            fp.write(u'{0} {1:.4f} {2:.4f} {3}\n'.format(
                fragment.name, fragment.interval.start, fragment.interval.end,
                fragment.mark))

    with open(path.join(outdir, '{}.split'.format(CORPUS)), 'w') as fp:
        for name, interval in sorted(intervals_from_phn.iteritems()):
            fp.write(u'{0} {1:.4f} {2:.4f}\n'.format(name, interval.start,
                                                     interval.end))

    return phn_fragments, wrd_fragments
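For reference, each line written to the .phn and .wrd files above has the form "name start end mark" with times to four decimals, and the .split file holds one whole-file interval per line. A made-up illustration (filenames and values are hypothetical):

 # hypothetical excerpt of <CORPUS>.phn:
 #   fileA 0.0000 0.2500 a
 #   fileA 0.2500 0.6100 r
 # hypothetical excerpt of <CORPUS>.split:
 #   fileA 0.0000 1.4900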
Example 12
def test_typeset():
    pairs = [(FragmentToken(None, Interval(0, 1), 'm{0}'.format(n1)),
              FragmentToken(None, Interval(0, 1), 'n{0}'.format(n2)))
             for n1, n2 in zip(xrange(10), xrange(10, 20))]
    assert (set(list(
        typeset(pairs))) == set(['m{0}'.format(n) for n in xrange(10)] +
                                ['n{0}'.format(n) for n in xrange(10, 20)]))
    pairs = []
    assert (set(list(typeset(pairs))) == set())

    pairs = [(FragmentToken(None, Interval(0, 1), 'm{0}'.format(n)),
              FragmentToken(None, Interval(0, 1), 'm{0}'.format(n)))
             for n in xrange(10)]
    assert (set(typeset(pairs)) == set('m{0}'.format(n) for n in xrange(10)))
Example 13
class TestCheckTruncateIntervals(object):
    m1 = IntervalDB({'a': [(0.0, 1.0), (2.0, 3.0)]})
    d1 = ClassDict(
        {ClassID(0, 'm1'): (FragmentToken('a', Interval(0.0, 1.0), 'm1'), )})
    d2 = ClassDict(
        {ClassID(0, 'm1'): (FragmentToken('a', Interval(0.5, 1.5), 'm1'), )})
    d3 = ClassDict(
        {ClassID(0, 'm1'): (FragmentToken('b', Interval(0.0, 1.0), 'm1'), )})
    d4 = ClassDict(
        {ClassID(0, 'm1'): (FragmentToken('a', Interval(0.5, 2.5), 'm1'), )})
    sa = [
        SegmentAnnotation('a', [
            FragmentToken('a', Interval(0.0, 0.25), 'a'),
            FragmentToken('a', Interval(0.25, 0.5), 'b'),
            FragmentToken('a', Interval(0.5, 0.75), 'c'),
            FragmentToken('a', Interval(0.75, 1.0), 'd')
        ])
    ]
    ca = Corpus(sa)

    def test_good_interval(self):
        assert (truncate_intervals(self.d1, self.ca,
                                   self.m1) == (self.d1, [], []))

    def test_truncate_interval(self):
        assert (truncate_intervals(self.d2, self.ca, self.m1) == (ClassDict({
            ClassID(0, 'm1'): (FragmentToken('a', Interval(0.5, 1.0),
                                             ('c', 'd')), )
        }), [], []))
        assert (truncate_intervals(self.d4, self.ca, self.m1) == (ClassDict({
            ClassID(0, 'm1'): (FragmentToken('a', Interval(0.5, 1.0),
                                             ('c', 'd')), )
        }), [], []))
Example 14
def test_freqs():
    pairs = [(FragmentToken(None, Interval(0, 1), 'm{0}'.format(n1)),
              FragmentToken(None, Interval(0, 1), 'n{0}'.format(n2)))
             for n1, n2 in zip(xrange(10), xrange(10, 20))]
    assert (freqs(pairs) == dict({'m{0}'.format(n): 1
                                  for n in xrange(10)}.items() +
                                 {'n{0}'.format(n): 1
                                  for n in xrange(10, 20)}.items()))
    pairs = []
    assert (freqs(pairs) == dict())

    pairs = [(FragmentToken(None, Interval(0, 1), 'm{0}'.format(n)),
              FragmentToken(None, Interval(0, 1), 'm{0}'.format(n)))
             for n in xrange(10)]
    assert (freqs(pairs) == {'m{0}'.format(n): 1 for n in xrange(10)})
Example 15
def pairwise_substring_completion(fragment1, fragment2, corpus, minlength,
                                  maxlength):
    name1, name2 = fragment1.name, fragment2.name
    tokenseq1 = [(f.mark, f.interval)
                 for f in corpus.tokens(name1, fragment1.interval)]
    tokenseq2 = [(f.mark, f.interval)
                 for f in corpus.tokens(name2, fragment2.interval)]

    for seq1, seq2 in psubstrings(tokenseq1, tokenseq2, minlength, maxlength):
        submark1, intervalseq1 = zip(*seq1)
        submark2, intervalseq2 = zip(*seq2)
        interval1 = Interval(intervalseq1[0].start, intervalseq1[-1].end)
        interval2 = Interval(intervalseq2[0].start, intervalseq2[-1].end)
        yield (FragmentToken(name1, interval1, submark1),
               FragmentToken(name2, interval2, submark2))
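Since pairwise_substring_completion carries no docstring, a hypothetical usage sketch may help: judging from the code, it expands a pair of discovered fragments into paired phone-level substrings between minlength and maxlength. Here `corpus` is assumed to be a Corpus covering file 'a', as in the test fixtures elsewhere on this page, and the fragments are made up:

 f1 = FragmentToken('a', Interval(0.0, 0.5), None)
 f2 = FragmentToken('a', Interval(0.5, 1.0), None)
 for t1, t2 in pairwise_substring_completion(f1, f2, corpus, 1, 2):
     print(t1.mark, t2.mark)  # paired tuples of phone marks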
Example 16
def collapse(intervals):
    """
    Compute the union of a list of intervals.

    The union of intervals is defined as the set-theoretic union
    for intervals that overlap and concatenation for intervals that don't.

    Parameters
    ----------
    intervals : list of Intervals

    Returns
    -------
    list of Intervals

    """
    intervals = sorted(intervals, key=lambda x: x.start)
    nodes = [Node(i) for i in intervals]
    for i in xrange(len(intervals)):
        for j in xrange(i+1, len(intervals)):
            i1 = intervals[i]
            i2 = intervals[j]
            if not i2.is_right_adjacent_to(i1) and i2.start > i1.end:
                break
            if i1.overlap(i2) > 0 or i1.is_adjacent(i2):
                nodes[i].add_link(nodes[j])
    r = []
    for c in connected(nodes):
        starts, ends = zip(*(node.value for node in c))
        r.append(Interval(min(starts), max(ends)))
    return sorted(r, key=lambda x: x.start)
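To make the docstring concrete, here is a minimal sketch of collapse in action, assuming the Interval class from this codebase; the values and the expected result are illustrative only:

 ivals = [Interval(0.0, 1.0), Interval(0.5, 1.5),   # overlapping: merged
          Interval(1.5, 2.0),                       # adjacent: merged
          Interval(3.0, 4.0)]                       # disjoint: kept apart
 print(collapse(ivals))
 # expected: [Interval(0.0, 2.0), Interval(3.0, 4.0)]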
Example 17
def truncate_intervals(clsdict, corpus, mapping):
    disc = {}
    interval_errors = []
    filename_errors = []
    for class_id in clsdict:
        fragments = []
        for fragment in clsdict[class_id]:
            qname = fragment.name
            qstart = fragment.interval.start
            qend = fragment.interval.end
            try:
                finterval = mapping.largest_overlap(qname, fragment.interval)
            except KeyError:
                filename_errors.append(fragment.name)
                continue
            except ValueError:
                interval_errors.append(fragment)
                continue
            fstart, fend = finterval
            if qstart != fstart or qend != fend:
                newstart = max(qstart, fstart)
                newend = min(qend, fend)
                newinterval = Interval(newstart, newend)
                newmark = corpus.annotation(qname, newinterval)
                fragment = FragmentToken(qname, newinterval, newmark)
            fragments.append(fragment)
        disc[class_id] = tuple(fragments)
    return ClassDict(disc), filename_errors, interval_errors
Example 18
def load_alignment(fname, strip_tags=True):
    """Loads a .ctm alignment file into FragmentTokens."""
    fragment_lists = []
    fragments = []
    previous_name = ""
    
    for line in open(fname):
        name, _, start, duration, mark = line.strip().split(' ')
        if name != previous_name:
            if fragments != []:
                fragment_lists.append(fragments)
            fragments = []
        previous_name = name
        start = round(float(start), 2)
        stop = start + round(float(duration), 2)
        interval = Interval(start, stop)

        if "phone" in fname and strip_tags:
            mark = mark.split('_')[0]
        
        fragment = FragmentToken(name, interval, mark)
        fragments.append(fragment)
    if fragments != []:
        fragment_lists.append(fragments)
        
    # Phone and word alignments aren't necessarily in the same order, so sort.
    fragment_lists.sort()
    return fragment_lists
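The loader assumes .ctm lines with five space-separated fields: utterance name, channel, start time, duration, and mark. A hypothetical sketch (the path and file contents are made up):

 # contents of alignments/phone.ctm (hypothetical):
 #   utt_01 1 0.00 0.25 a_B
 #   utt_01 1 0.25 0.40 r_I
 #   utt_02 1 0.00 0.30 m_E
 # since "phone" is in the filename and strip_tags is True,
 # the marks become 'a', 'r', 'm'
 tokenlists = load_alignment('alignments/phone.ctm', strip_tags=True)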
Example 19
def annotate_classes(clsdict, corpus, split=None):
    new = {}  # with annotation
    errors = []
    check_split = split is not None
    for classID, tokenlist in clsdict.iteritems():
        newtokens = []
        for token in tokenlist:
            filename = token.name
            interval = token.interval
            if check_split and not split.is_covered(filename, interval):
                errors.append(token)
                try:
                    finterval = split.largest_overlap(filename, interval)
                    qstart, qend = interval
                    fstart, fend = finterval
                    if fstart != qstart or fend != qend:
                        newstart = max(fstart, qstart)
                        newend = min(fend, qend)
                        interval = Interval(newstart, newend)
                except KeyError:
                    continue
                except ValueError:
                    continue
            try:
                annot = tuple(corpus.annotation(filename, interval))
            except Exception:
                # skip tokens whose annotation cannot be retrieved
                continue
            newtokens.append(FragmentToken(filename, interval, annot))
        if len(newtokens) > 0:
            newtokens = tuple(newtokens)
            new[classID] = newtokens
    return ClassDict(new), errors
Example 20
def split_em(phn_fragments, outdir):
    intervals = {
        f[0].name: Interval(f[0].interval.start, f[-1].interval.end)
        for f in phn_fragments
    }

    names_cross = list(grouper(1000, random.sample(intervals.items(), 4000)))
    intervals_per_speaker = defaultdict(set)
    for fname, interval in intervals.iteritems():
        intervals_per_speaker[fname.split('_')[2]].add((fname, interval))
    names_within = [
        list(v) for v in intervals_per_speaker.values() if len(v) > 200
    ]

    with open(path.join(outdir, 'xitsonga.intervals.cross'), 'w') as fp:
        fp.write('\n\n'.join('\n'.join(
            '{0} {1:.2f} {2:.2f}'.format(name, interval.start, interval.end)
            for name, interval in sorted(ns)) for ns in names_cross))

    with open(path.join(outdir, 'xitsonga.intervals.within'), 'w') as fp:
        fp.write('\n\n'.join('\n'.join(
            '{0} {1:.2f} {2:.2f}'.format(name, interval.start, interval.end)
            for name, interval in sorted(ns)) for ns in names_within))

    fnames = list(set(f[0].name for f in phn_fragments))
    with open(path.join(outdir, 'xitsonga.files'), 'w') as fp:
        fp.write('\n'.join(sorted(fnames)))
Example 21
 def test_find(self):
     assert (list(self.m.find('a', self.q1)) == [Interval(0.0, 1.0)])
     assert (list(self.m.find('a', self.q2)) == [Interval(0.0, 1.0)])
     assert (list(self.m.find('a', self.q3)) == [Interval(0.0, 1.0)])
     assert (list(self.m.find(
         'a', self.q4)) == [Interval(0.0, 1.0),
                            Interval(2.0, 3.0)])
     assert (list(self.m.find('a', self.q5)) == [])
     assert (list(self.m.find('a', self.q6)) == [])
     assert (list(self.m.find('a', self.q7)) == [Interval(2.0, 3.0)])
     assert (list(self.m.find(
         'a', self.q8)) == [Interval(2.0, 3.0),
                            Interval(4.0, 5.0)])
     assert (list(self.m.find(
         'a', self.q9)) == [Interval(2.0, 3.0),
                            Interval(4.0, 5.0)])
     assert (list(self.m.find('a', self.q10)) == [])
Example 22
def load_annot(fname):
    fs = []
    bname = path.splitext(path.basename(fname))[0]
    for line in open(fname):
        start, stop, mark = line.strip().split(' ')
        interval = Interval(round(float(start), 2), round(float(stop), 2))
        fragment = FragmentToken(bname, interval, mark)
        fs.append(fragment)
    return fs
Example 23
class TestFragmentType(object):
    tokens = [
        FragmentToken('a', Interval(0.0, 0.1), 'a'),
        FragmentToken('a', Interval(0.1, 0.2), 'r'),
        FragmentToken('a', Interval(0.2, 0.3), 'm'),
        FragmentToken('a', Interval(0.3, 0.4), 's'),
        FragmentToken('a', Interval(0.4, 0.5), 'a')
    ]

    def test_mark(self):
        ft = FragmentType(self.tokens, 'markymark')
        assert (ft.tokens == self.tokens)
        assert (ft.mark == 'markymark')

    def test_no_mark(self):
        ft = FragmentType(self.tokens, None)
        assert (ft.tokens == self.tokens)
        assert (ft.mark is None)
Example 24
 def __init__(self, name, tokens):
     self.name = name
     self.tokens = SortedList(tokens, key=cmp_to_key(token_cmp))
     if len(self.tokens) == 0:
         self.interval = None
     else:
         self.interval = Interval(self.tokens[0].interval.start,
                                  self.tokens[-1].interval.end)
     if not all(t1.interval.end == t2.interval.start
                for t1, t2 in zip(self.tokens[:-1], self.tokens[1:])):
         raise ValueError('Non-contiguous tokens.')
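The contiguity check means the constructor accepts only token lists whose intervals abut exactly; a quick sketch of both cases, using the FragmentToken and Interval types seen throughout this page:

 # contiguous tokens: accepted
 SegmentAnnotation('f', [FragmentToken('f', Interval(0.0, 0.5), 'a'),
                         FragmentToken('f', Interval(0.5, 1.0), 'b')])
 # gap between 0.5 and 0.6: raises ValueError('Non-contiguous tokens.')
 SegmentAnnotation('f', [FragmentToken('f', Interval(0.0, 0.5), 'a'),
                         FragmentToken('f', Interval(0.6, 1.0), 'b')])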
Example 26
def split_em(phn_fragments, outdir, prefix):
    intervals = {
        f[0].name: Interval(f[0].interval.start, f[-1].interval.end)
        for f in phn_fragments
    }

    size = len(phn_fragments)
    print(size)
    # sample all files and split them into groups of size/10 items each
    # for the cross-speaker set
    names_cross = list(
        grouper(size / 10, random.sample(intervals.items(), size)))
    intervals_per_speaker = defaultdict(set)
    for fname, interval in intervals.iteritems():
        # mboshi filenames encode the speaker in the first "_"-field;
        # otherwise treat each file as its own speaker
        if prefix == "mboshi":
            intervals_per_speaker[fname.split("_")[0]].add((fname, interval))
        else:
            intervals_per_speaker[fname].add((fname, interval))
    # no minimum group-size filter (e.g. len(v) > 2) here: it would leave
    # intervals.within empty when there is no speaker information
    names_within = [list(v) for v in intervals_per_speaker.values()]

    # grouper pads the last group with None; drop the padding
    names_cross[-1] = [
        element for element in names_cross[-1] if element is not None
    ]
    with open(path.join(outdir, prefix + '.intervals.cross'), 'w') as fp:
        fp.write('\n\n'.join('\n'.join(
            '{0} {1:.2f} {2:.2f}'.format(name, interval.start, interval.end)
            for name, interval in sorted(ns)) for ns in names_cross))

    with open(path.join(outdir, prefix + '.intervals.within'), 'w') as fp:
        fp.write('\n\n'.join('\n'.join(
            '{0} {1:.2f} {2:.2f}'.format(name, interval.start, interval.end)
            for name, interval in sorted(ns)) for ns in names_within))
        fp.write('\n')

    fnames = list(set(f[0].name for f in phn_fragments))
    print(len(fnames))
    with open(path.join(outdir, prefix + '.files'), 'w') as fp:
        fp.write('\n'.join(sorted(fnames)))
        fp.write('\n')
Example 27
class TestTokenCmp(object):
    f1 = FragmentToken('a', Interval(0.0, 0.5), None)
    f2 = FragmentToken('a', Interval(0.5, 1.5), None)
    f3 = FragmentToken('a', Interval(1.3, 1.4), None)
    f4 = FragmentToken('b', Interval(0, 1), None)

    def test_invalid_comparison(self):
        with pytest.raises(ValueError):
            token_cmp(self.f1, self.f4)

    def test_token_eq(self):
        assert (token_cmp(self.f1, self.f1) == 0)
        assert (token_cmp(self.f2, self.f2) == 0)
        assert (token_cmp(self.f3, self.f3) == 0)

    def test_token_cmp(self):
        assert (token_cmp(self.f1, self.f2) == -1)
        assert (token_cmp(self.f1, self.f3) == -1)
        assert (token_cmp(self.f2, self.f1) == 1)
        assert (token_cmp(self.f3, self.f1) == 1)

        assert (token_cmp(self.f2, self.f3) == 0)
        assert (token_cmp(self.f3, self.f2) == 0)
Example 28
    def test_restrict(self):
        db1 = IntervalDB({
            'a': [Interval(0, 1)],
            'b': [Interval(0, 3)],
            'c': [Interval(0, 3)]
        })
        assert (self.c1.restrict(db1) == self.c1)
        assert (self.c2.restrict(db1) == self.c2)
        assert (self.c2.restrict(db1, remove_singletons=True) == ClassDict({}))
        assert (self.c3.restrict(db1) == ClassDict({}))
        assert (self.c4.restrict(db1) == self.c4)

        db2 = IntervalDB({'a': [Interval(0, 1)], 'c': [Interval(0, 3)]})
        assert (self.c1.restrict(db2) == self.c2)
        assert (self.c2.restrict(db2) == self.c2)
        assert (self.c2.restrict(db2, remove_singletons=True) == ClassDict({}))
        assert (self.c3.restrict(db2) == ClassDict({}))
        assert (self.c4.restrict(db2) == ClassDict({
            self.id0: (self.tokens[0], self.tokens[2]),
            self.id1: (self.tokens[4], )
        }))
        assert (self.c4.restrict(db2, remove_singletons=True) == ClassDict(
            {self.id0: (self.tokens[0], self.tokens[2])}))
Example 29
def load_match_file(match_fn, phn_corpus):
    with open(match_fn) as f:
        matches = []
        for line in f:
            fields = line.strip().split()
            if len(fields) == 2:
                base1, base2 = fields
            elif len(fields) == 6:
                dtw = float(fields[4])
                # convert 1/100-second units to seconds
                start1, end1, start2, end2 = map(
                    lambda x: float(x) / 100.0, fields[:4])
                interval1 = Interval(start1, end1)
                interval2 = Interval(start2, end2)
                fragment1 = FragmentToken(
                    base1, interval1, phn_corpus.annotation(base1, interval1))
                fragment2 = FragmentToken(
                    base2, interval2, phn_corpus.annotation(base2, interval2))
                matches.append(Match(fragment1, fragment2, dtw))

        random.shuffle(matches)
        return matches[:100000]
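The match file parsed above apparently mixes two-field lines naming a pair of file basenames with six-field lines carrying two fragment intervals (in 1/100-second units), a DTW score in the fifth field, and an unused sixth field. A made-up sketch:

 # hypothetical match-file excerpt:
 #   fileA fileB
 #   105 163 418 476 0.2134 32
 matches = load_match_file('out/master_match', phn_corpus)  # path is made up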
Example 30
def extract_single(tokens1, tokens2, minlength, maxlength, same):
    """Extract gold alignments between two phone lists.

    Parameters
    ----------
    tokens1, tokens2 : list of FragmentTokens
    minlength : int
        Minimum number of symbols in a fragment
    maxlength : int
        Maximum number of symbols in a fragment
    same : boolean
        Whether `tokens1` and `tokens2` are identical.

    Returns
    -------
    l : list of (FragmentToken, FragmentToken)
        List of token pairs containing the cooccurring fragments

    """
    ids1, intervals1, phones1 = zip(*tokens1)
    ids2, intervals2, phones2 = zip(*tokens2)
    id1 = ids1[0]  # ids are all the same
    id2 = ids2[0]
    css = allcommonsubstrings(phones1, phones2,
                              minlength=minlength, maxlength=maxlength,
                              same=same)
    if css is None:
        return []
    r = []
    for slice1, slice2 in css:
        r.append((FragmentToken(id1,
                                Interval(intervals1[slice1.start].start,
                                         intervals1[slice1.stop - 1].end),
                                phones1[slice1]),
                  FragmentToken(id2,
                                Interval(intervals2[slice2.start].start,
                                         intervals2[slice2.stop - 1].end),
                                phones2[slice2])))
    return r
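A minimal usage sketch for extract_single, with two made-up token lists that share the phone sequence ('b', 'c'); the expected result is illustrative:

 toks_x = [FragmentToken('x', Interval(0.0, 0.1), 'a'),
           FragmentToken('x', Interval(0.1, 0.2), 'b'),
           FragmentToken('x', Interval(0.2, 0.3), 'c')]
 toks_y = [FragmentToken('y', Interval(0.0, 0.1), 'b'),
           FragmentToken('y', Interval(0.1, 0.2), 'c')]
 pairs = extract_single(toks_x, toks_y, minlength=2, maxlength=5, same=False)
 # expected: one pair of FragmentTokens covering the shared 'b c' stretch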
Example 31
class TestCheckIntervals(object):
    m1 = IntervalDB({'a': [(0.0, 1.0), (2.0, 3.0)]})
    d1 = ClassDict(
        {ClassID(0, 'm1'): (FragmentToken('a', Interval(0.0, 1.0), 'm1'), )})
    d2 = ClassDict(
        {ClassID(0, 'm1'): (FragmentToken('a', Interval(0.5, 1.5), 'm1'), )})
    d3 = ClassDict(
        {ClassID(0, 'm1'): (FragmentToken('b', Interval(0.0, 1.0), 'm1'), )})
    d4 = ClassDict(
        {ClassID(0, 'm1'): (FragmentToken('a', Interval(0.5, 2.5), 'm1'), )})

    def test_good_interval(self):
        assert (check_intervals(self.d1, self.m1) == ([], []))

    def test_interval_errors(self):
        assert (check_intervals(self.d2, self.m1) == ([
            FragmentToken('a', Interval(0.5, 1.5), 'm1')
        ], []))
        assert (check_intervals(self.d4, self.m1) == ([
            FragmentToken('a', Interval(0.5, 2.5), 'm1')
        ], []))

    def test_bad_filename(self):
        assert (check_intervals(self.d3, self.m1) == ([], ['b']))