Beispiel #1
0
    def read(self, filename):
        """
        Read an Xtrans file and fill the Transcription.

        The first line of the file gives the tab-separated column names.
        Lines starting with ';;' are comments and blank lines are skipped.
        A tier is created for each speaker-channel observed in the file,
        and each remaining line adds one annotation to its tier.

        @param filename: (str) name of the file to read, decoded as UTF-8.
        @raise IndexError: if the file is empty, or if a data line has fewer
        fields than the position of a requested column.
        @raise ValueError: if a requested column name is not in the first
        line, or if a start/end value cannot be converted to float.

        """
        # The whole content is loaded first, so the file is closed before
        # any tier is created.
        with codecs.open(filename, 'r', 'utf-8') as fp:
            lines = fp.readlines()

        # readlines() keeps the end-of-line characters: remove them, otherwise
        # the last column name (and the last field of each data line) would
        # carry a trailing '\n' or '\r\n' and could not be matched.
        rownames = lines[0].rstrip('\r\n').split('\t')

        def field(row, name):
            # Value stored in the column 'name' of an already split line.
            return row[rownames.index(name)]

        # media url -> media identifier, to share one id among the tiers
        # that refer to the same file.
        medias = {}

        # Extract rows, create tiers and metadata.
        for line in lines[1:]:

            # a comment
            if line.startswith(';;'):
                continue

            # a blank line carries no annotation
            line = line.rstrip('\r\n')
            if not line.strip():
                continue

            # a tab-delimited line
            row = line.split('\t')

            # fix the name of the tier
            channel = field(row, 'channel;int')
            speaker = field(row, 'speaker;unicode')
            tiername = speaker + '-' + channel

            # check for the tier (find it or create it)
            tier = self.Find(tiername)
            if tier is None:
                tier = Tier(tiername)

                mediaurl = field(row, 'file;unicode')
                if mediaurl not in medias:
                    medias[mediaurl] = gen_id()
                mediaid = medias[mediaurl]

                (mediamime, mediaencoding) = mimetypes.guess_type(mediaurl)
                media = Media(mediaid, mediaurl, mediamime)
                if mediaencoding is not None:
                    media.metadata["encoding"] = mediaencoding

                tier.SetMedia(media)
                tier.metadata["speakerName"] = speaker
                tier.metadata["speakerType"] = field(row, 'speakerType;unicode')
                tier.metadata["speakerDialect"] = field(row, 'speakerDialect;unicode')
                tier.metadata["mediaChannel"] = channel
                self.Append(tier)

            # Add the new annotation
            label = Label(field(row, 'transcript;unicode'))
            begin = TimePoint(float(field(row, 'start;float')))
            end = TimePoint(float(field(row, 'end;float')))
            tier.Add(Annotation(TimeInterval(begin, end), label))
Beispiel #2
0
    def test_align(self):
        """
        Convert a tier of French utterances with sppasTok, align the two
        resulting tiers and compare their text values with the expected ones.

        Only the first nine utterances are checked; the tenth one is part of
        the input but has no expected value here.

        """
        vocabfile = os.path.join(SPPAS, "resources", "vocab", "FR.vocab")
        tok = sppasTok(vocabfile, "FR")

        utterances = (
            u"pa(r)ce que j'ai euh",
            u"un p(e)tit peu",
            u"[i(l)s, iz] ont pas d(e) culture",
            u"d'aut(re)",
            u"(e)st-ce qu'elle a l'air bien ou pas",
            u"p(eu)t-êt(re) moins évident",
            u"[pa(r)ce que, passe] c'est euh",
            u"t(out) ça",
            u"j'(ai) euh",
            u"[entre-elles, entrèl]",
        )

        # One annotation per utterance, on consecutive one-unit intervals.
        tier = Tier()
        for start, text in enumerate(utterances):
            interval = TimeInterval(TimePoint(start), TimePoint(start + 1))
            tier.Append(Annotation(interval, Label(text)))

        faked, std = tok.convert(tier)
        tok.align(std, faked)

        # (expected std text, expected faked text), in annotation order.
        expected = (
            (u"parce_que j' ai euh",
             u"pace_que j' ai euh"),
            (u"un_petit_peu",
             u"un_ptit_peu"),
            (u"ils_ont pas de culture",
             u"iz ont_pas d culture"),
            (u"d'autre",
             u"d'aut"),
            (u"est-ce_qu' elle a l' air bien ou pas",
             u"st-ce_qu' elle a l' air bien ou pas"),
            (u"peut-être moins évident",
             u"ptêt moins évident"),
            (u"parce_que c'est euh",
             u"passe c'est euh"),
            (u"tout_ça",
             u"t_ça"),
            (u"j' euh",
             u"j' euh"),
        )
        for index, (std_text, faked_text) in enumerate(expected):
            self.assertEqual(std[index].TextValue, std_text)
            self.assertEqual(faked[index].TextValue, faked_text)
Beispiel #3
0
def point2interval(tier, fixradius=None):
    """
    Convert a tier of points into a tier of intervals.

    Each annotation is replaced by an annotation located on the interval
    [midpoint-radius, midpoint+radius] of its point, with the same label
    value. Ensure the radius to be always >= 1 millisecond.

    Do not convert alternatives.

    @param tier: (Tier)
    @param fixradius: (float) radius applied to every point. If None, the
    radius of each point is used.
    @return Tier: a copy of the given tier if it is already made of
    intervals, a new tier otherwise. The metadata of the given tier and of
    its annotations are copied (shallow copy), not shared.

    """
    # Local import: keeps the function self-contained whatever the
    # module-level imports are.
    import copy

    if tier.IsInterval():
        return tier.Copy()

    new_tier = Tier(tier.GetName())
    # Shallow copy: TIER_TYPE is set below and must not be written into
    # the metadata of the given tier.
    new_tier.metadata = copy.copy(tier.metadata)
    new_tier.SetMedia(tier.GetMedia())
    new_tier.SetCtrlVocab(tier.GetCtrlVocab())
    new_tier.SetDataType(tier.GetDataType())
    # new_tier.SetTranscription( tier.GetTranscription() ) # no need
    new_tier.metadata['TIER_TYPE'] = "TimePoint"

    for a in tier:
        # get point with the best score for this annotation
        point = a.GetLocation().GetPoint()
        midpoint = point.GetMidpoint()
        radius = fixradius if fixradius is not None else point.GetRadius()
        # A missing radius is handled like a too small one.
        # NOTE(review): assumes GetRadius() may return None -- confirm.
        if radius is None or radius < 0.001:
            radius = 0.001

        # NOTE(review): midpoint-radius is not checked against 0 here;
        # confirm how TimePoint handles a negative value.
        begin = TimePoint(midpoint - radius, radius)
        end = TimePoint(midpoint + radius, radius)

        new_a = Annotation(TimeInterval(begin, end),
                           Label(a.GetLabel().GetValue()))
        # Copied for the same reason as the tier metadata.
        new_a.metadata = copy.copy(a.metadata)
        new_tier.Append(new_a)

    return new_tier
Beispiel #4
0
def merge_overlapping_annotations(tier, separator=' '):
    """
    Merge overlapping annotations of an interval tier.

    The annotations are visited in the iteration order of the tier and
    each one is compared with the last annotation kept ("prev"). Where two
    annotations overlap, the time span is cut into consecutive intervals
    and the label values of the overlapping part are concatenated with
    the separator.

    The annotations yielded by the tier are appended to the result and
    modified (begin, end, label value) without being copied.
    NOTE(review): if iterating the tier yields its own annotation objects,
    the given tier is altered too -- confirm against Tier.

    The controlled vocabulary of the result is set to None as soon as
    labels are concatenated.

    Do not pay attention to alternatives.

    @param tier: (Tier)
    @param separator: (str) inserted between two concatenated label values.
    @return Tier: the given tier itself if it is not an interval tier,
    a new tier otherwise.

    """
    # Nothing to merge: the given tier is returned as it is (not a copy).
    if tier.IsInterval() is False:
        return tier

    new_tier = Tier(tier.GetName())
    # The metadata object is shared with the given tier, not copied.
    new_tier.metadata = tier.metadata
    new_tier.SetMedia( tier.GetMedia() )
    new_tier.SetDataType( tier.GetDataType() )
    new_tier.SetTranscription( tier.GetTranscription() )
    new_tier.SetCtrlVocab( tier.GetCtrlVocab() )
    # Last annotation kept in new_tier; the next one is compared with it.
    prev = None

    for a in tier:

        # The first annotation is always kept unchanged.
        if prev is None:
            new_tier.Append(a)
            prev = a
            continue

        # Earlier approach (TW), kept for reference:
        # test whether prev overlaps with a
        #if prev and prev.Begin < a.End and a.Begin < prev.End:
            # Interval containing both prev and a
            #prev.TextValue += separator + a.TextValue
            #prev.EndValue = max((prev.EndValue, a.EndValue))

        # a starts before prev: it is silently dropped.
        # prev can be a remainder segment (a2) created below, which starts
        # later than the annotation it was cut from.
        if a.GetLocation().GetBegin() < prev.GetLocation().GetBegin():
            # TODO
            # it happens if more than 2 annotations are starting at the same time
            #print "IGNORED: ",a
            continue

        # a is after prev: normal.
        if a.GetLocation().GetBegin() >= prev.GetLocation().GetEnd():
            new_tier.Append(a)
            prev = a

        # prev and a start at the same time
        elif a.GetLocation().GetBegin() == prev.GetLocation().GetBegin():
            new_tier.SetCtrlVocab( None )
            # must disable CtrlVocab or, eventually, add new labels in its entries...

            # a ends after prev: prev gets both labels; a is shortened to
            # the part that follows prev and keeps its own label.
            if a.GetLocation().GetEnd() > prev.GetLocation().GetEnd():
                a.GetLocation().SetBegin( prev.GetLocation().GetEnd() )
                prev.GetLabel().SetValue( prev.GetLabel().GetValue() + separator + a.GetLabel().GetValue())
                new_tier.Append(a)
                prev = a

            # a ends before prev: prev is shortened to the span of a and
            # gets both labels; a2 covers the rest of prev with the label
            # prev had before the concatenation. a itself is not appended.
            elif a.GetLocation().GetEnd() < prev.GetLocation().GetEnd():
                a2 = Annotation(TimeInterval(a.GetLocation().GetEnd(),prev.GetLocation().GetEnd()),Label(prev.GetLabel().GetValue()))
                prev.GetLocation().SetEnd( a.GetLocation().GetEnd() )
                prev.GetLabel().SetValue( prev.GetLabel().GetValue() + separator + a.GetLabel().GetValue())
                new_tier.Append(a2)
                prev = a2

            # same begin and same end: only the label of prev is extended.
            else:
                prev.GetLabel().SetValue( prev.GetLabel().GetValue() + separator + a.GetLabel().GetValue())

        # a starts inside prev
        elif a.GetLocation().GetBegin() < prev.GetLocation().GetEnd():
            new_tier.SetCtrlVocab( None )
            # must disable CtrlVocab or, eventually, add new labels in its entries...

            # a ends before prev: three segments. prev is cut at the begin
            # of a, a gets both labels (its own value first), and a2 covers
            # the rest of prev with the label of prev.
            if a.GetLocation().GetEnd() < prev.GetLocation().GetEnd():
                a2 = Annotation(TimeInterval(a.GetLocation().GetEnd(),prev.GetLocation().GetEnd()),Label(prev.GetLabel().GetValue()))
                a.GetLabel().SetValue( a.GetLabel().GetValue() + separator + prev.GetLabel().GetValue() )
                prev.GetLocation().SetEnd( a.GetLocation().GetBegin() )
                new_tier.Append(a)
                new_tier.Append(a2)
                prev = a2

            # a ends after prev: three segments. a2 is the common part with
            # both labels (value of prev first); prev is cut at the begin of
            # a2 and a starts at the end of a2.
            elif a.GetLocation().GetEnd() > prev.GetLocation().GetEnd():
                a2 = Annotation(TimeInterval(a.GetLocation().GetBegin(),prev.GetLocation().GetEnd()),Label(prev.GetLabel().GetValue() + separator + a.GetLabel().GetValue()))
                prev.GetLocation().SetEnd( a2.GetLocation().GetBegin() )
                a.GetLocation().SetBegin(  a2.GetLocation().GetEnd() )
                new_tier.Append(a2)
                new_tier.Append(a)
                prev = a

            # a and prev end at the same time: prev is cut at the begin of
            # a, and a gets both labels (its own value first).
            else:
                a.GetLabel().SetValue( a.GetLabel().GetValue() + separator + prev.GetLabel().GetValue() )
                prev.GetLocation().SetEnd( a.GetLocation().GetBegin() )
                new_tier.Append(a)
                prev = a

    return new_tier