Esempio n. 1
0
    def read(self, filename):
        """
        Read an Xtrans file and fill the Transcription.

        The first line of the file is a tab-delimited header naming the
        columns. Each following line is either a comment (starting with
        ';;') or a tab-delimited row describing one annotation.
        A tier named "<speaker>-<channel>" is created for each
        speaker-channel pair observed in the file, unless a tier of that
        name is already found in this Transcription.

        :param filename: path of the UTF-8 encoded Xtrans file.
        :raises IndexError: if the file is empty, or if a row has fewer
            fields than the position of a column that is read.
        :raises ValueError: if a column that is read is missing from the
            header, or if a start/end value is not a valid float.

        """
        with codecs.open(filename, 'r', 'utf-8') as fp:
            lines = fp.readlines()

        # readlines() keeps the line terminator: remove it, otherwise the
        # last column name (and the last field of each row) ends with it.
        rownames = lines[0].rstrip('\r\n').split('\t')

        # Column name -> position. Filled on first use, so that a missing
        # column is only reported when a row really needs it.
        positions = {}

        def field(row, name):
            """Return the value of the column `name` in the split `row`."""
            if name not in positions:
                positions[name] = rownames.index(name)
            return row[positions[name]]

        # Media url -> media identifier, to share an id between the tiers
        # that refer to the same media file.
        medias = {}

        # Extract rows, create tiers and metadata.
        for line in lines[1:]:

            # a comment
            if line.startswith(';;'):
                continue

            # a blank line (typically at the end of the file)
            if not line.strip():
                continue

            # a tab-delimited line
            row = line.rstrip('\r\n').split('\t')

            # fix the name of the tier
            channel = field(row, 'channel;int')
            speaker = field(row, 'speaker;unicode')
            tiername = speaker + '-' + channel

            # check for the tier (find it or create it)
            tier = self.Find(tiername)
            if tier is None:
                tier = Tier(tiername)
                mediaurl = field(row, 'file;unicode')
                if mediaurl not in medias:
                    medias[mediaurl] = gen_id()
                mediaid = medias[mediaurl]
                (mediamime, mediaencoding) = mimetypes.guess_type(mediaurl)
                media = Media(mediaid, mediaurl, mediamime)
                if mediaencoding is not None:
                    media.metadata["encoding"] = mediaencoding

                tier.SetMedia(media)
                tier.metadata["speakerName"] = speaker
                tier.metadata["speakerType"] = field(row, 'speakerType;unicode')
                tier.metadata["speakerDialect"] = field(row, 'speakerDialect;unicode')
                tier.metadata["mediaChannel"] = channel
                self.Append(tier)

            # Add the new annotation
            label = Label(field(row, 'transcript;unicode'))
            begin = TimePoint(float(field(row, 'start;float')))
            end = TimePoint(float(field(row, 'end;float')))
            tier.Add(Annotation(TimeInterval(begin, end), label))
Esempio n. 2
0
 def __read_media(self, mediaRoot):
     """
     Build a Media from the attributes of `mediaRoot` and register it.

     The media gets a newly generated identifier, the url read from the
     mandatory 'MEDIA_URL' attribute, and the mime type read from the
     'MIME_TYPE' attribute (an empty string if that attribute is absent).
     If a 'RELATIVE_MEDIA_URL' attribute exists, it is copied into the
     metadata of the media under the same key.

     The media is added to this Transcription but it is not linked to
     any tier: Elan doesn't propose it.

     :param mediaRoot: node whose `attrib` mapping describes the media.
     :raises KeyError: if the 'MEDIA_URL' attribute is missing.

     """
     new_id = gen_id()
     attributes = mediaRoot.attrib
     media = Media(new_id,
                   attributes['MEDIA_URL'],
                   attributes.get('MIME_TYPE', ''))

     # Keep the relative url, if any, as a metadata of the media
     relative_key = 'RELATIVE_MEDIA_URL'
     if relative_key in attributes:
         media.metadata[relative_key] = attributes[relative_key]

     self.AddMedia(media)
Esempio n. 3
0
    def __read_media(self, mediaRoot):
        """
        Build a Media from `mediaRoot` and link it to the tiers it lists.

        The identifier and the url of the media are read from the
        mandatory 'id' and 'url' attributes; the mime type is read from
        the 'mimetype' attribute (an empty string if it is absent).
        The text of the 'Content' child, if there is one, is stored as
        the content of the media. Then the media is assigned to each tier
        referenced by a 'Tier' child, looked up by its 'id' attribute in
        the id-to-tier map of this instance.

        :param mediaRoot: node describing the media (presumably an
            ElementTree element -- it is used through `attrib`, `find`
            and `findall`).
        :raises KeyError: if the 'id' or 'url' attribute is missing, or
            if a 'Tier' child has no 'id' attribute. A tier id unknown to
            the id-to-tier map presumably raises KeyError too (assuming
            the map is a dict -- to be confirmed).

        """
        # Create a Media instance
        attributes = mediaRoot.attrib
        mediaid = attributes['id']
        mediaurl = attributes['url']
        mediamime = attributes.get('mimetype', '')
        media = Media(mediaid, mediaurl, mediamime)

        # Add content if any.
        # The node must be compared to None: the truth value of an
        # ElementTree element is False when it has no child element, which
        # is precisely the case of a 'Content' node holding only text.
        contentRoot = mediaRoot.find('Content')
        if contentRoot is not None:
            media.content = contentRoot.text

        # link to tiers
        for tierNode in mediaRoot.findall('Tier'):
            tier = self.__id_tier_map[tierNode.attrib['id']]
            tier.SetMedia(media)