def read(self, filename):
    """
    Read an Xtrans file and fill the Transcription.

    It creates a tier for each speaker-channel observed in the file.
    The first line of the file is the tab-delimited header naming the
    columns. Each following line is one annotation, except lines
    starting with ';;' (comments) and blank lines, which are skipped.

    Each tier is named '<speaker>-<channel>'. When a tier is created it
    gets a Media built from the 'file;unicode' column and the metadata
    "speakerName", "speakerType", "speakerDialect" and "mediaChannel".

    :param filename: path of the Xtrans file; it is decoded as UTF-8.
    :raises IndexError: if the file is empty, or if a row has fewer
        fields than the header requires.
    :raises ValueError: if a required column is missing from the header,
        or if a start/end value cannot be converted to float.

    """
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()

    if not lines:
        # Same exception type as the former bare lines[0] access, but
        # with a message that names the actual problem.
        raise IndexError("Empty Xtrans file (no header line): %s" % filename)

    # Strip the line terminator first: otherwise the last column name
    # keeps its '\n' (or '\r\n') and can never be matched.
    rownames = lines[0].rstrip('\r\n').split('\t')

    # Resolve the column positions once instead of once per row.
    col_file = rownames.index('file;unicode')
    col_channel = rownames.index('channel;int')
    col_start = rownames.index('start;float')
    col_end = rownames.index('end;float')
    col_speaker = rownames.index('speaker;unicode')
    col_speaker_type = rownames.index('speakerType;unicode')
    col_speaker_dialect = rownames.index('speakerDialect;unicode')
    col_transcript = rownames.index('transcript;unicode')

    # Media url -> media id, so that tiers sharing a file share the id.
    medias = {}

    # Extract rows, create tiers and metadata.
    for raw_line in lines[1:]:

        # a comment
        if raw_line.startswith(';;'):
            continue

        # a blank line (typically at the end of the file)
        if not raw_line.strip():
            continue

        # a tab-delimited line; only the line terminator is removed so
        # that empty trailing fields are preserved.
        fields = raw_line.rstrip('\r\n').split('\t')

        # fix the name of the tier
        channel = fields[col_channel]
        speaker = fields[col_speaker]
        tiername = speaker + '-' + channel

        # check for the tier (find it or create it)
        tier = self.Find(tiername)
        if tier is None:
            tier = Tier(tiername)

            mediaurl = fields[col_file]
            if mediaurl not in medias:
                medias[mediaurl] = gen_id()
            mediaid = medias[mediaurl]

            (mediamime, mediaencoding) = mimetypes.guess_type(mediaurl)
            media = Media(mediaid, mediaurl, mediamime)
            if mediaencoding is not None:
                media.metadata["encoding"] = mediaencoding

            tier.SetMedia(media)
            tier.metadata["speakerName"] = speaker
            tier.metadata["speakerType"] = fields[col_speaker_type]
            tier.metadata["speakerDialect"] = fields[col_speaker_dialect]
            tier.metadata["mediaChannel"] = channel
            self.Append(tier)

        # Add the new annotation
        label = Label(fields[col_transcript])
        begin = TimePoint(float(fields[col_start]))
        end = TimePoint(float(fields[col_end]))
        tier.Add(Annotation(TimeInterval(begin, end), label))
def __read_media(self, mediaRoot):
    """
    Create a Media from the attributes of mediaRoot and register it.

    A new identifier is generated for the media. The 'MEDIA_URL'
    attribute is required; 'MIME_TYPE' falls back to an empty string
    when absent. If present, 'RELATIVE_MEDIA_URL' is copied into the
    media metadata under the same key.

    :param mediaRoot: node exposing an ``attrib`` mapping.
    :raises KeyError: if mediaRoot has no 'MEDIA_URL' attribute.

    """
    attributes = mediaRoot.attrib

    # Create a Media instance
    new_id = gen_id()
    url = attributes['MEDIA_URL']
    mime = attributes['MIME_TYPE'] if 'MIME_TYPE' in attributes else ''
    media = Media(new_id, url, mime)

    # Add metadata
    relative_key = 'RELATIVE_MEDIA_URL'
    if relative_key in attributes:
        media.metadata[relative_key] = attributes[relative_key]

    # The media is added to the Transcription only: it is not linked
    # to any tier (Elan doesn't propose it).
    self.AddMedia(media)
def __read_media(self, mediaRoot):
    """
    Create a Media from mediaRoot and link it to the tiers it lists.

    The media identifier and url come from the required 'id' and 'url'
    attributes; 'mimetype' falls back to an empty string when absent.
    If mediaRoot has a 'Content' child, its text becomes the media
    content. Each 'Tier' child refers, by its 'id' attribute, to a tier
    of the id-to-tier map of this reader; the media is set on each of
    these tiers.

    :param mediaRoot: element exposing ``attrib``, ``find`` and
        ``findall``.
    :raises KeyError: if 'id' or 'url' is missing on mediaRoot, if a
        'Tier' child has no 'id', or if that id is not in the map.

    """
    # Create a Media instance
    mediaid = mediaRoot.attrib['id']
    mediaurl = mediaRoot.attrib['url']
    mediamime = mediaRoot.attrib['mimetype'] if 'mimetype' in mediaRoot.attrib else ''
    media = Media(mediaid, mediaurl, mediamime)

    # Add content if any.
    # The test must be "is not None": the truth value of an element is
    # False when it has no child element, which is precisely the case
    # of a Content node that only holds text.
    contentRoot = mediaRoot.find('Content')
    if contentRoot is not None:
        media.content = contentRoot.text

    # link to tiers
    for tierNode in mediaRoot.findall('Tier'):
        tier = self.__id_tier_map[tierNode.attrib['id']]
        tier.SetMedia(media)