def read(self, filename):
    """
    Read an Xtrans file and fill the Transcription.
    It creates a tier for each speaker-channel observed in the file.

    The first line of the file gives the tab-separated column names.
    Lines starting with ';;' are comments; blank lines are skipped.

    @param filename: (str) path of the tab-delimited file, read as UTF-8.
    @raise IndexError: if the file is empty, or if a row has fewer columns
    than the header requires.
    @raise ValueError: if a required column name is missing from the header,
    or if a start/end value is not a number.

    """
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()

    # Drop the end-of-line first: otherwise the last column name keeps its
    # newline and can never be matched by name.
    rownames = lines[0].rstrip('\r\n').split('\t')

    # media url -> media identifier, so that tiers of the same file share it
    medias = {}

    # Positions of the columns read on every row. They are resolved once, at
    # the first data row, so a file without any data row is not rejected
    # because of its header.
    columns = None

    # Extract rows, create tiers and metadata.
    for line in lines[1:]:

        # a comment
        if line.startswith(';;'):
            continue

        # a blank line carries no annotation
        line = line.rstrip('\r\n')
        if not line.strip():
            continue

        if columns is None:
            columns = tuple(rownames.index(name) for name in (
                'channel;int',
                'speaker;unicode',
                'transcript;unicode',
                'start;float',
                'end;float',
            ))
        ichannel, ispeaker, itranscript, istart, iend = columns

        # a tab-delimited line
        fields = line.split('\t')

        # fix the name of the tier
        channel = fields[ichannel]
        speaker = fields[ispeaker]
        tiername = speaker + '-' + channel

        # check for the tier (find it or create it)
        tier = self.Find(tiername)
        if tier is None:
            tier = Tier(tiername)

            mediaurl = fields[rownames.index('file;unicode')]
            if mediaurl not in medias:
                medias[mediaurl] = gen_id()
            mediaid = medias[mediaurl]

            (mediamime, mediaencoding) = mimetypes.guess_type(mediaurl)
            media = Media(mediaid, mediaurl, mediamime)
            if mediaencoding is not None:
                media.metadata["encoding"] = mediaencoding
            tier.SetMedia(media)

            tier.metadata["speakerName"] = speaker
            tier.metadata["speakerType"] = fields[rownames.index('speakerType;unicode')]
            tier.metadata["speakerDialect"] = fields[rownames.index('speakerDialect;unicode')]
            tier.metadata["mediaChannel"] = channel
            self.Append(tier)

        # Add the new annotation
        label = Label(fields[itranscript])
        begin = TimePoint(float(fields[istart]))
        end = TimePoint(float(fields[iend]))
        tier.Add(Annotation(TimeInterval(begin, end), label))
def test_align(self):
    """
    Tokenize a few French utterances and check the aligned tokenizations.

    A tier is filled with one annotation per utterance, converted into its
    "faked" and "standard" tokenizations, then both are aligned. The text of
    the first nine annotations of each result is compared with the expected
    one; the last utterance is tokenized but not checked.

    """
    vocabfile = os.path.join(SPPAS, "resources", "vocab", "FR.vocab")
    tokenizer = sppasTok(vocabfile, "FR")

    utterances = (
        u"pa(r)ce que j'ai euh",
        u"un p(e)tit peu",
        u"[i(l)s, iz] ont pas d(e) culture",
        u"d'aut(re)",
        u"(e)st-ce qu'elle a l'air bien ou pas",
        u"p(eu)t-êt(re) moins évident",
        u"[pa(r)ce que, passe] c'est euh",
        u"t(out) ça",
        u"j'(ai) euh",
        u"[entre-elles, entrèl]",
    )

    # One annotation per utterance, on consecutive one-unit intervals.
    tier = Tier()
    for index, text in enumerate(utterances):
        interval = TimeInterval(TimePoint(index), TimePoint(index + 1))
        tier.Append(Annotation(interval, Label(text)))

    faked, std = tokenizer.convert(tier)
    tokenizer.align(std, faked)

    # (standard, faked) expected texts, in the order of the utterances.
    expected = (
        (u"parce_que j' ai euh", u"pace_que j' ai euh"),
        (u"un_petit_peu", u"un_ptit_peu"),
        (u"ils_ont pas de culture", u"iz ont_pas d culture"),
        (u"d'autre", u"d'aut"),
        (u"est-ce_qu' elle a l' air bien ou pas", u"st-ce_qu' elle a l' air bien ou pas"),
        (u"peut-être moins évident", u"ptêt moins évident"),
        (u"parce_que c'est euh", u"passe c'est euh"),
        (u"tout_ça", u"t_ça"),
        (u"j' euh", u"j' euh"),
    )
    for index, (std_text, faked_text) in enumerate(expected):
        self.assertEqual(std[index].TextValue, std_text)
        self.assertEqual(faked[index].TextValue, faked_text)
def point2interval(tier, fixradius=None):
    """
    Convert localization: turn a tier of points into a tier of intervals.
    Each point becomes the interval [midpoint-radius, midpoint+radius].
    Ensure the radius to be always >= 1 millisecond.
    Do not convert alternatives.

    The given tier is left untouched: the returned tier gets its own
    metadata container. The metadata of each annotation is not copied, the
    new annotation refers to the same object.

    @param tier: (Tier)
    @param fixradius: (float) radius applied to every point instead of the
    radius of the point itself. With None (default), the radius of each
    point is used.
    @return Tier: a copy of the tier if it is already an interval tier,
    a new tier of intervals otherwise.

    """
    import copy

    if tier.IsInterval():
        return tier.Copy()

    new_tier = Tier(tier.GetName())
    # Shallow copy: the 'TIER_TYPE' entry set below must not be written into
    # the metadata of the tier given by the caller.
    new_tier.metadata = copy.copy(tier.metadata)
    new_tier.SetMedia(tier.GetMedia())
    new_tier.SetCtrlVocab(tier.GetCtrlVocab())
    new_tier.SetDataType(tier.GetDataType())
    # Record that the localizations were points.
    # NOTE(review): presumably read back by a reader/writer - confirm.
    new_tier.metadata['TIER_TYPE'] = "TimePoint"

    for a in tier:
        # get point with the best score for this annotation
        point = a.GetLocation().GetPoint()
        midpoint = point.GetMidpoint()

        radius = fixradius if fixradius is not None else point.GetRadius()
        if radius < 0.001:
            radius = 0.001

        begin = TimePoint(midpoint - radius, radius)
        end = TimePoint(midpoint + radius, radius)
        new_a = Annotation(TimeInterval(begin, end), Label(a.GetLabel().GetValue()))
        new_a.metadata = a.metadata
        new_tier.Append(new_a)

    return new_tier
def merge_overlapping_annotations(tier, separator=' '):
    """
    Merge overlapping annotations.
    The values of the labels are concatenated.
    Do not pay attention to alternatives.

    Annotations are visited in the order given by the tier and each one is
    compared with the last annotation kept ("prev"). When both overlap, they
    are cut at each other's boundaries and the overlapping part gets both
    labels joined with the separator.

    Notice that:
        - a tier which is not an interval tier is returned as it is
          (the same object, not a copy);
        - the annotations of the given tier are modified in place (their
          boundaries and labels are changed) and the same annotation objects
          are appended to the returned tier;
        - the returned tier refers to the same metadata object as the
          given tier;
        - the controlled vocabulary of the returned tier is removed as soon
          as one overlap is found;
        - an annotation starting before the previous one is ignored.

    @param tier: (Tier)
    @param separator: (str) inserted between two concatenated label values.
    @return Tier

    """
    if tier.IsInterval() is False:
        return tier

    new_tier = Tier(tier.GetName())
    new_tier.metadata = tier.metadata
    new_tier.SetMedia( tier.GetMedia() )
    new_tier.SetDataType( tier.GetDataType() )
    new_tier.SetTranscription( tier.GetTranscription() )
    new_tier.SetCtrlVocab( tier.GetCtrlVocab() )

    # prev is the last annotation kept: the next one is compared with it.
    prev = None

    for a in tier:

        # the first annotation is kept as it is
        if prev is None:
            new_tier.Append(a)
            prev = a
            continue

        if a.GetLocation().GetBegin() < prev.GetLocation().GetBegin():
            # TODO
            # it happens if more than 2 annotations are starting at the same time
            # (the begin of prev may have been moved forward by a previous merge).
            # Such an annotation is ignored.
            continue

        # a is after prev: normal.
        if a.GetLocation().GetBegin() >= prev.GetLocation().GetEnd():
            new_tier.Append(a)
            prev = a

        # prev and a start at the same time
        elif a.GetLocation().GetBegin() == prev.GetLocation().GetBegin():
            new_tier.SetCtrlVocab( None ) # must disable CtrlVocab or, eventually, add new labels in its entries...

            if a.GetLocation().GetEnd() > prev.GetLocation().GetEnd():
                # a ends after prev: prev gets both labels and a is reduced
                # to the part following prev.
                a.GetLocation().SetBegin( prev.GetLocation().GetEnd() )
                prev.GetLabel().SetValue( prev.GetLabel().GetValue() + separator + a.GetLabel().GetValue())
                new_tier.Append(a)
                prev = a

            elif a.GetLocation().GetEnd() < prev.GetLocation().GetEnd():
                # a ends before prev: a2 is created for the end of prev, with
                # the label of prev only (it is read before the merge), then
                # prev is reduced to the span of a and gets both labels.
                # a itself is not appended.
                a2 = Annotation(TimeInterval(a.GetLocation().GetEnd(),prev.GetLocation().GetEnd()),Label(prev.GetLabel().GetValue()))
                prev.GetLocation().SetEnd( a.GetLocation().GetEnd() )
                prev.GetLabel().SetValue( prev.GetLabel().GetValue() + separator + a.GetLabel().GetValue())
                new_tier.Append(a2)
                prev = a2

            else:
                # same begin and same end: only the label of a is added to prev.
                prev.GetLabel().SetValue( prev.GetLabel().GetValue() + separator + a.GetLabel().GetValue())

        # a starts inside prev
        elif a.GetLocation().GetBegin() < prev.GetLocation().GetEnd():
            new_tier.SetCtrlVocab( None ) # must disable CtrlVocab or, eventually, add new labels in its entries...

            if a.GetLocation().GetEnd() < prev.GetLocation().GetEnd():
                # a is strictly inside prev: a2 is created for the end of
                # prev (label of prev), a gets both labels, and prev is
                # reduced to the part preceding a.
                a2 = Annotation(TimeInterval(a.GetLocation().GetEnd(),prev.GetLocation().GetEnd()),Label(prev.GetLabel().GetValue()))
                a.GetLabel().SetValue( a.GetLabel().GetValue() + separator + prev.GetLabel().GetValue() )
                prev.GetLocation().SetEnd( a.GetLocation().GetBegin() )
                new_tier.Append(a)
                new_tier.Append(a2)
                prev = a2

            elif a.GetLocation().GetEnd() > prev.GetLocation().GetEnd():
                # a ends after prev: a2 is created for the overlapping part
                # (both labels), prev is reduced to the part preceding a2 and
                # a to the part following a2.
                a2 = Annotation(TimeInterval(a.GetLocation().GetBegin(),prev.GetLocation().GetEnd()),Label(prev.GetLabel().GetValue() + separator + a.GetLabel().GetValue()))
                prev.GetLocation().SetEnd( a2.GetLocation().GetBegin() )
                a.GetLocation().SetBegin( a2.GetLocation().GetEnd() )
                new_tier.Append(a2)
                new_tier.Append(a)
                prev = a

            else:
                # a and prev end at the same time: a gets both labels and
                # prev is reduced to the part preceding a.
                a.GetLabel().SetValue( a.GetLabel().GetValue() + separator + prev.GetLabel().GetValue() )
                prev.GetLocation().SetEnd( a.GetLocation().GetBegin() )
                new_tier.Append(a)
                prev = a

    return new_tier