def __format_tier(self, tierRoot, tier):
    """
    Fill the XML element of a tier with its attributes and annotations.

    Sets LINGUISTIC_TYPE_REF and TIER_ID on tierRoot, copies the
    DEFAULT_LOCALE and PARTICIPANT entries of the tier metadata when they
    are present, then appends one ANNOTATION child per annotation.

    :param tierRoot: element receiving the attributes and the children
    :param tier: tier to serialize; a point tier is first converted with
        point2interval(tier, ELAN_RADIUS), and the tier is then passed
        through merge_overlapping_annotations
    """
    tierRoot.set('LINGUISTIC_TYPE_REF', linguistic_type_from_tier(tier))
    tierRoot.set('TIER_ID', tier.GetName())

    for name in ('DEFAULT_LOCALE', 'PARTICIPANT'):
        if name in tier.metadata.keys():
            tierRoot.set(name, tier.metadata[name])

    if tier.IsPoint():
        tier = point2interval(tier, ELAN_RADIUS)
    tier = merge_overlapping_annotations(tier)

    parent = self._hierarchy.get_parent(tier)
    if parent is None:
        # No parent tier: every annotation is written as an alignable one.
        for ann in tier:
            node = ET.SubElement(tierRoot, 'ANNOTATION')
            # The formatter reports False when nothing was created; the
            # ANNOTATION placeholder is then removed again.
            if self.__format_alignable_annotation(node, ann) is False:
                tierRoot.remove(node)
        return

    tierRoot.set('PARENT_REF', parent.GetName())
    # previousRefId only exists on the instance while the reference
    # annotations of this tier are being formatted.
    self.previousRefId = None
    for ann in tier:
        node = ET.SubElement(tierRoot, 'ANNOTATION')
        self.__format_ref_annotation(node, ann, parent)
    del self.previousRefId
def __build_timeslots(self):
    """
    Build self.timeSlotIds, the mapping of time slots to identifiers.

    For every annotation of every tier, the begin and end midpoints
    (rounded to 4 decimals) are paired with the annotation. The
    (time, annotation) pairs are sorted by time and numbered from 1, so
    that self.timeSlotIds maps each pair to 'ts1', 'ts2', ...

    Point tiers are converted with point2interval(tier, ELAN_RADIUS) and
    every tier is passed through merge_overlapping_annotations first.
    """
    from operator import itemgetter

    self.timeSlotIds = OrderedDict()
    pairs = []

    for tier in self:
        if tier.IsPoint():
            tier = point2interval(tier, ELAN_RADIUS)
        tier = merge_overlapping_annotations(tier)

        for ann in tier:
            loc = ann.GetLocation()
            pairs.extend([
                (round(loc.GetBeginMidpoint(), 4), ann),
                (round(loc.GetEndMidpoint(), 4), ann),
            ])

    # Sort by time value only (the sort is stable, so pairs sharing a
    # time keep their insertion order) and assign the identifiers.
    for rank, pair in enumerate(sorted(pairs, key=itemgetter(0)), 1):
        self.timeSlotIds[pair] = 'ts%s' % rank
def writeliatxt(self, filename, t=0):
    """
    Write one tier of the Transcription as a one-column token file.

    The file is encoded in iso8859-1. Each annotation is enclosed
    between a "<s>" line and a "</s>" line; for interval annotations the
    whitespace-separated tokens of the text are written one per line.
    Nothing is written when the tier is empty.

    Parameters:
        - filename is the output file name
        - t is the index of the tier to write
    Exception:   IOError
    Return:      None
    """
    with codecs.open(filename, 'w', 'iso8859-1') as out:
        selected = self[t]
        if selected.IsEmpty():
            # The context manager closes the (empty) file.
            return

        if selected.IsInterval():
            selected = fill_gaps(selected)
            selected = merge_overlapping_annotations(selected)

        for ann in selected:
            out.write("<s>\n")
            if ann.IsInterval():
                for token in ann.TextValue.strip().split():
                    out.write(token + "\n")
                out.write(" ")
            out.write("</s>\n")
def writecsv(self, filename):
    """
    Write all non-empty tiers into a semicolon-separated text file.

    The file is encoded in utf-8, with one line per annotation and four
    double-quoted columns: tier name, begin, end, text. For a point
    annotation the point value takes the begin column and the end column
    is left empty. Text values are written as they are (no escaping).

    Parameters:
        - filename is the output file name
    Exception:   IOError, Exception
    Return:      None
    """
    separator = '";"'
    with codecs.open(filename, 'w', 'utf-8') as out:
        for tier in self:
            if tier.IsEmpty():
                continue

            if tier.IsInterval():
                tier = fill_gaps(tier)
                tier = merge_overlapping_annotations(tier)

            for ann in tier:
                pieces = [' "', tier.Name, separator]
                if ann.IsInterval():
                    pieces += [str(ann.BeginValue), separator,
                               str(ann.EndValue)]
                else:
                    # The extra separator yields the empty end column.
                    pieces += [str(ann.PointValue), separator]
                pieces += [separator, ann.TextValue, '"\n']
                out.write(''.join(pieces))
def writeipulab(self, filename):
    """
    Write an HTK lab file, segmented by IPUs.

    Time values are multiplied by 10000000 and truncated to integers,
    i.e. time is represented in units of 100ns.

    Lab files are used to create MLF files; they use the following
    specifications:
        [start1 [end1]] label1 [score] {auxlabel [auxscore]} [comment]
    where:
        - [.] are optionals (0 or 1)
        - {.} possible repetitions (1,2,3...)

    All non-empty tiers are written, one after the other, into the same
    utf-8 file.

    Parameters:
        - filename is the output file name
    Exception:   IOError, Exception
    Return:      None
    """
    # True while the previously written labelled interval belongs to the
    # IPU being written; reset by each unlabelled interval.
    # NOTE(review): the flag is not reset between tiers -- confirm intended.
    inipu = False
    encoding='utf-8'
    with codecs.open(filename, 'w', encoding) as fp:
        for tier in self:
            if tier.IsEmpty():
                continue
            if tier.IsInterval():
                tier = fill_gaps(tier)
                tier = merge_overlapping_annotations(tier)
            for annotation in tier:
                if annotation.IsPoint():
                    # A point annotation only gets its time value written.
                    __p = int(annotation.PointValue * 10000000)
                    fp.write(str(__p))
                    fp.write(" ")
                else:
                    __s = int(annotation.BeginValue * 10000000)
                    __e = int(annotation.EndValue * 10000000)
                    # NOTE(review): the nesting of the label handling below
                    # was reconstructed; it is kept in the interval branch
                    # because only this branch defines __s and __e.
                    if annotation.IsLabel():
                        # Dots are treated as token separators.
                        labstr = annotation.TextValue.strip()
                        labstr = labstr.replace('.', ' ')
                        tablab = labstr.split()
                        if inipu == True:
                            # Inside an IPU: labels only, no time value.
                            for label in tablab:
                                fp.write(label + "\n")
                        else:
                            # First labelled interval of an IPU: the start
                            # time precedes the first label.
                            if len(tablab) < 2:
                                # Zero or one token: the unmodified text
                                # value is written, not the split token.
                                fp.write(str( __s )+" ")
                                #fp.write(str( __e )+" ")
                                fp.write(annotation.TextValue + "\n")
                            else:
                                fp.write(str( __s )+" ")
                                for label in tablab:
                                    fp.write(label + "\n")
                            # NOTE(review): assumed to apply to both cases
                            # above -- confirm against the intended format.
                            inipu = True
                    else:
                        # Unlabelled interval: written as "start end sil",
                        # which ends the current IPU.
                        fp.write(str( __s )+" ")
                        fp.write(str( __e )+" sil\n")
                        inipu = False
def write(self, filename, encoding='UTF-8'):
    """
    Write an Antx file.

    The XML tree is built under an 'AnnotationSystemDataSet' root in
    this order: layers (one per tier), segments (one per annotation),
    media, then configurations. Any exception raised while building or
    writing the tree propagates to the caller.

    :param filename: output file, passed to ElementTree.write
    :param encoding: encoding of the output, passed to ElementTree.write

    """
    root = ET.Element('AnnotationSystemDataSet')
    root.set('xmlns', 'http://tempuri.org/AnnotationSystemDataSet.xsd')

    # Write layers
    for tier in self:
        Antx.__format_tier(root, tier)

    # Write segments
    for tier in self:
        if tier.IsPoint():
            tier = point2interval(tier, ANTX_RADIUS)
        tier = merge_overlapping_annotations(tier)
        for ann in tier:
            self.__format_segment(root, tier, ann)

    # Write media (falsy entries are skipped)
    for media in self.GetMedia():
        if media:
            Antx.__format_media(root, media)

    # Write configurations: the required ones first, using the metadata
    # value when there is one and the default otherwise...
    for key, default in ELT_REQUIRED_Configuration.items():
        Antx.__format_configuration(root, key,
                                    self.metadata.get(key, default))

    # ... then every other metadata entry.
    for key, value in self.metadata.items():
        if key not in ELT_REQUIRED_Configuration:
            Antx.__format_configuration(root, key, value)

    indent(root)

    tree = ET.ElementTree(root)
    tree.write(filename,
               encoding=encoding,
               xml_declaration=True,
               method="xml")
    # TODO: add standalone="yes" in the declaration
    # (but not available with ElementTree)
def writeinfo(self, filename, t=0):
    """
    Write one tier of the Transcription as a 5-column info file.

    The file is encoded in utf-8. Each interval annotation produces one
    line of space-separated columns:
        begin_time end_time middle_time number duration
    where number is the count of whitespace-separated tokens of the
    annotation text. Nothing is written when the tier is empty.

    Parameters:
        - filename is the output file name
        - t is the tier number
    Exception:   IOError, Exception
    Return:      None
    """
    with codecs.open(filename, 'w', 'utf-8') as out:
        selected = self[t]
        if selected.IsEmpty():
            # The context manager closes the (empty) file.
            return

        if selected.IsInterval():
            selected = fill_gaps(selected)
            selected = merge_overlapping_annotations(selected)

        for ann in selected:
            if not ann.IsInterval():
                continue
            begin = ann.BeginValue
            end = ann.EndValue
            duration = end - begin
            middle = begin + (duration / 2.0)
            nb_tokens = len(ann.TextValue.strip().split())
            columns = (begin, end, middle, nb_tokens, duration)
            out.write(" ".join([str(c) for c in columns]) + "\n")
def __format_tier(self, tier, number):
    """
    Format a tier from a transcription to the TextGrid format.

    An empty tier first receives one annotation spanning GetMinTime() to
    GetMaxTime(); the given tier object itself is modified in that case.
    An interval tier is passed through fill_gaps and
    merge_overlapping_annotations before being formatted.

    @param tier: The tier to format.
    @param number: The position of the tier in the list of all tiers.
    @return: The header of the tier followed by its formatted annotations.
    """
    # Fill empty tiers because TextGrid does not support empty tiers.
    if tier.IsEmpty():
        whole_span = TimeInterval(TimePoint(self.GetMinTime()),
                                  TimePoint(self.GetMaxTime()))
        tier.Append(Annotation(whole_span))

    if tier.IsInterval():
        tier = fill_gaps(tier, self.GetMinTime(), self.GetMaxTime())
        tier = merge_overlapping_annotations(tier)

    is_interval = tier.IsInterval()
    if is_interval:
        tier_class = 'IntervalTier'
    else:
        tier_class = 'TextTier'

    template = (
        ' item [%d]:\n'
        ' class = "%s"\n'
        ' name = "%s"\n'
        ' xmin = %f\n'
        ' xmax = %f\n'
        ' intervals: size = %s\n')
    chunks = [template % (number,
                          tier_class,
                          tier.GetName(),
                          tier.GetBeginValue(),
                          tier.GetEndValue(),
                          tier.GetSize())]

    if is_interval:
        format_annotation = TextGrid.__format_interval_annotation
    else:
        format_annotation = TextGrid.__format_point_annotation

    # Annotations are numbered from 1.
    for index, ann in enumerate(tier, 1):
        chunks.append(format_annotation(ann, index))
    return ''.join(chunks)
def __build_timeslots(self):
    """
    Build self.timeSlotIds, the mapping of time values to identifiers.

    The begin and end midpoints of every annotation of every tier are
    rounded to 4 decimals. Each distinct value gets the identifier
    't<i>', where i counts the distinct values from 0 in the order they
    are first met. self.timeSlotIds is only assigned once all tiers
    have been processed.

    Membership is tested against the dict under construction instead of
    a list, so the cost stays linear in the number of annotations.
    """
    slot_ids = {}
    for tier in self:
        if tier.IsPoint():
            tier = point2interval(tier, ELAN_RADIUS)
        tier = merge_overlapping_annotations(tier)
        for annotation in tier:
            # Point tiers have been converted with point2interval above.
            location = annotation.GetLocation()
            begin = round(location.GetBeginMidpoint(), 4)
            end = round(location.GetEndMidpoint(), 4)
            for value in (begin, end):
                if value not in slot_ids:
                    # len() is the number of distinct values met so far,
                    # i.e. the 0-based index of this new value.
                    slot_ids[value] = 't%s' % len(slot_ids)
    self.timeSlotIds = slot_ids