def naiveWordAlignment( tg: textgrid.Textgrid, utteranceTierName: str, wordTierName: str, isle: isletool.Isle, phoneHelperTierName: Optional[str] = None, removeOverlappingSegments: bool = False, ) -> textgrid.Textgrid: """Performs naive alignment for utterances in a textgrid Naive alignment gives each segment equal duration. Word duration is determined by the duration of an utterance and the number of phones in the word. By 'utterance' I mean a string of words separated by a space bounded in time eg (0.5, 1.5, "he said he likes ketchup"). Args: tg: the textgrid to do alignment over utteranceTierName: name of the utterance tier to examine wordTierName: name of the word tier to create and write segments to isle: an instance of Isle phoneHelperTierName: creates a tier that is parallel to the word tier. However, the labels are the phones for the word, rather than the word removeOverlappingSegments: remove any labeled words or phones that fall under labeled utterances Returns: a modified version of the input textgrid with the word segmented Raises: WordNotInIsleError: The word was not in the Isle dictionary """ utteranceTier = tg.tierDict[utteranceTierName] wordTier = None if wordTierName in tg.tierNameList: wordTier = tg.tierDict[wordTierName] # Load in the word tier, if it exists: wordEntryList = [] phoneEntryList = [] if wordTier is not None: if removeOverlappingSegments: for start, stop, _ in utteranceTier.entryList: wordTier = wordTier.eraseRegion( start, stop, praatioConstants.EraseCollision.TRUNCATE, False) wordEntryList = wordTier.entryList # Do the naive alignment for start, stop, label in utteranceTier.entryList: wordList = label.split() # Get the list of phones in each word superPhoneList: List[List[str]] = [] numPhones = 0 i = 0 while i < len(wordList): word = wordList[i] try: entry = isle.lookup(word)[0] except errors.WordNotInIsleError: wordList.pop(i) continue superPhoneList.append(entry.phonemeList.phonemes) numPhones += len(entry.phonemeList.phonemes) i += 1 # Get the naive alignment for words, if alignment doesn't # already exist for words subWordEntryList = [] subPhoneEntryList = [] if wordTier is not None: subWordEntryList = wordTier.crop( start, stop, praatioConstants.CropCollision.TRUNCATED, False).entryList if len(subWordEntryList) == 0: wordStart = start phoneDur = (stop - start) / float(numPhones) for i, word in enumerate(wordList): phoneListTxt = " ".join(superPhoneList[i]) wordEnd = wordStart + (phoneDur * len(superPhoneList[i])) subWordEntryList.append((wordStart, wordEnd, word)) subPhoneEntryList.append((wordStart, wordEnd, phoneListTxt)) wordStart = wordEnd wordEntryList.extend(subWordEntryList) phoneEntryList.extend(subPhoneEntryList) # Replace or add the word tier newWordTier = textgrid.IntervalTier(wordTierName, wordEntryList, tg.minTimestamp, tg.maxTimestamp) if wordTier is not None: tg.replaceTier(wordTierName, newWordTier) else: tg.addTier(newWordTier) # Add the phone tier # This is mainly used as an annotation tier if phoneHelperTierName is not None and len(phoneEntryList) > 0: newPhoneTier = textgrid.IntervalTier(phoneHelperTierName, phoneEntryList, tg.minTimestamp, tg.maxTimestamp) if phoneHelperTierName in tg.tierNameList: tg.replaceTier(phoneHelperTierName, newPhoneTier) else: tg.addTier(newPhoneTier) return tg
def syllabifyTextgrid( isle: isletool.Isle, tg: textgrid.Textgrid, wordTierName: str, phoneTierName: str, skipLabelList: Optional[List[str]] = None, start: Optional[float] = None, stop: Optional[float] = None, stressDetectionErrorMode: Literal["silence", "warning", "error"] = "error", syllabificationErrorMode: Literal["silence", "warning", "error"] = "error", ) -> textgrid.Textgrid: """Given a textgrid, syllabifies the phones in the textgrid The textgrid must have a word tier (used to lookup words) and a phone tier (for syllabifying). Args: isle: an instance of Isle tg: the textgrid to syllabify wordTierName: the tier containing intervals with one word per interval phoneTierName: tier containing intervals with one phone per interval skipLabelList: intervals in the word tier containing a label in this list will be skipped start: if not None, only consider intervals that appear after the start time stop: if not None, only consider intervals that appear before the stop time stressDetectionErrorMode: determines behavior if stress is not detected for a word syllabificationErrorMode: determines behavior if a word cannot be syllabified Returns: a textgrid with only two tiers containing syllable information (syllabification of the phone tier and a tier marking word-stress). Raises: WordNotInIsleError: the word was not in the dictionary StressedSyllableDetectionError: no stress found for a word """ utils.validateOption( "stressDetectionErrorMode", stressDetectionErrorMode, constants.ErrorReportingMode, ) utils.validateOption( "syllabificationErrorMode", syllabificationErrorMode, constants.ErrorReportingMode, ) stressErrorReporter = utils.getErrorReporter(stressDetectionErrorMode) syllabificationErrorReporter = utils.getErrorReporter( syllabificationErrorMode) minT = tg.minTimestamp maxT = tg.maxTimestamp wordTier = tg.tierDict[wordTierName] phoneTier = tg.tierDict[phoneTierName] if not isinstance(wordTier, textgrid.IntervalTier): raise AttributeError(f"Tier '{wordTierName}' must be an interval tier") if not isinstance(phoneTier, textgrid.IntervalTier): raise AttributeError( f"Tier '{phoneTierName}' must be an interval tier") if skipLabelList is None: skipLabelList = [] syllableEntryList = [] tonicSEntryList = [] tonicPEntryList = [] if start is not None or stop is not None: if start is None: start = minT if stop is None: stop = maxT wordTier = wordTier.crop(start, stop, praatioConstants.CropCollision.TRUNCATED, False) for entryStart, entryStop, word in wordTier.entryList: if word in skipLabelList: continue subPhoneTier = phoneTier.crop(entryStart, entryStop, praatioConstants.CropCollision.STRICT, False) phoneList = [ entry[2] for entry in subPhoneTier.entryList if entry[2] != "" ] try: sylTmp = isle.findBestSyllabification(word, phoneList) except errors.WordNotInIsleError: print( f"Not is isle -- skipping syllabification; Word '{word}' at {entryStart:.2f}" ) continue except errors.NullPronunciationError: print( f"No provided pronunciation; Word '{word}' at {entryStart:.2f}" ) continue except errors.ImpossibleSyllabificationError as e: syllabificationErrorReporter( errors.ImpossibleSyllabificationError, f"Syllabification error; Word '{word}' at {entryStart:.2f}; " + str(e), ) continue stressI = sylTmp.stressedVowelIndicies stressJ = sylTmp.stressedSyllableIndicies syllableList = sylTmp.syllables islesAdjustedSyllableList = syllableList if len(stressI) > 0 and len(stressJ) > 0: syllableList[stressI[0]].phonemes[stressJ[0]] += "ˈ" i = 0 for k, syllable in enumerate(syllableList): # Create the syllable tier entry j = len(syllable) stubEntryList = subPhoneTier.entryList[i:i + j] i += j # The whole syllable was deleted if len(stubEntryList) == 0: continue syllableStart = stubEntryList[0][0] syllableEnd = stubEntryList[-1][1] label = "-".join([entry[2] for entry in stubEntryList]) syllableEntryList.append((syllableStart, syllableEnd, label)) # Create the tonic syllable tier entry if k == stressI: tonicSEntryList.append( (syllableStart, syllableEnd, phonetic_constants.TONIC)) # Create the tonic phone tier entry if k == stressI: syllablePhoneTier = phoneTier.crop( syllableStart, syllableEnd, praatioConstants.CropCollision.STRICT, False, ) tonicSyllableEntries: List[textgrid.constants.Interval] = [ entry for entry in syllablePhoneTier.entryList if entry[2] != "" ] tonicSyllable = phonetics.Syllable( [phone for _, _, phone in tonicSyllableEntries]) cvList = tonicSyllable.simplify().phonemes tmpStressJ = None try: tmpStressJ = cvList.index(phonetic_constants.VOWEL) except ValueError: for char in phonetic_constants.STRESS_BEARING_CONSONANTS: if char in cvList: tmpStressJ = cvList.index(char) break if tmpStressJ is None: stressErrorReporter( errors.StressedSyllableDetectionError, f"No stressed syllable; word: '{word}' at {syllableStart:.2f}, " f"actual mapped pronunciation: {syllableList}, " f"ISLE's mapped pronunciation: {islesAdjustedSyllableList}", ) continue phoneStart, phoneEnd = tonicSyllableEntries[tmpStressJ][:2] tonicPEntryList.append( (phoneStart, phoneEnd, phonetic_constants.TONIC)) # Create a textgrid with the two syllable-level tiers syllableTier = textgrid.IntervalTier("syllable", syllableEntryList, minT, maxT) tonicSTier = textgrid.IntervalTier("tonicSyllable", tonicSEntryList, minT, maxT) tonicPTier = textgrid.IntervalTier("tonicVowel", tonicPEntryList, minT, maxT) syllableTG = textgrid.Textgrid() syllableTG.addTier(syllableTier) syllableTG.addTier(tonicSTier) syllableTG.addTier(tonicPTier) return syllableTG
def save( self, output_directory: typing.Optional[str] = None, output_format: typing.Optional[str] = None, save_transcription: bool = False, ) -> None: """ Output File to TextGrid or lab. If ``text_type`` is not specified, the original file type will be used, but if there was no text file for the file, it will guess lab format if there is only one utterance, otherwise it will output a TextGrid file. Parameters ---------- output_directory: str, optional Directory to output file, if None, then it will overwrite the original file output_format: str, optional Text type to save as, if not provided, it will use either the original file type or guess the file type save_transcription: bool Flag for whether the hypothesized transcription text should be saved instead of the default text """ utterance_count = len(self.utterances) overwrite = output_format is None if overwrite: # Saving directly if (utterance_count == 1 and self.utterances[0].begin == 0 and self.utterances[0].end == self.duration): output_format = TextFileType.LAB.value else: output_format = TextFileType.TEXTGRID.value output_path = self.construct_output_path(output_directory, output_format=output_format) if overwrite: if output_path != self.text_file.text_file_path and os.path.exists( self.text_file.text_file_path): os.remove(self.text_file.text_file_path) self.text_file.file_type = output_format self.text_file.text_file_path = output_path if output_format == TextFileType.LAB.value: if (utterance_count == 0 and os.path.exists(self.text_file.text_file_path) and not save_transcription): os.remove(self.text_file.text_file_path) return elif utterance_count == 0: return for u in self.utterances: if save_transcription: with open(output_path, "w", encoding="utf8") as f: f.write(u.transcription_text if u. transcription_text else "") elif u.text: with open(output_path, "w", encoding="utf8") as f: f.write(u.text) return elif output_format == TextFileType.TEXTGRID.value: max_time = self.sound_file.duration tiers = {} for speaker in self.speakers: tiers[speaker.speaker.name] = textgrid.IntervalTier( speaker.speaker.name, [], minT=0, maxT=max_time) tg = textgrid.Textgrid() tg.maxTimestamp = max_time for utterance in self.utterances: if save_transcription: tiers[utterance.speaker.name].entryList.append( Interval( start=utterance.begin, end=utterance.end, label=utterance.transcription_text if utterance.transcription_text else "", )) else: if tiers[utterance.speaker.name].entryList: if tiers[utterance.speaker. name].entryList[-1].end > utterance.begin: utterance.begin = tiers[ utterance.speaker.name].entryList[-1].end if utterance.end > self.duration: utterance.end = self.duration tiers[utterance.speaker.name].entryList.append( Interval(start=utterance.begin, end=utterance.end, label=utterance.text.strip())) for t in tiers.values(): tg.addTier(t) tg.save(output_path, includeBlankSpaces=True, format=output_format)
def naivePhoneAlignment( tg: textgrid.Textgrid, wordTierName: str, phoneTierName: str, isle: isletool.Isle, removeOverlappingSegments: bool = False, ) -> textgrid.Textgrid: """Performs naive alignment for words in a textgrid Naive alignment gives each segment equal duration. Phone duration is determined by the duration of the word and the number of phones. Args: tg: the textgrid to do alignment over wordTierName: name of the utterance tier to examine phoneTierName: name of the word tier to create and write segments to isle: an instance of Isle removeOverlappingSegments: remove any labeled words or phones that fall under labeled utterances Returns: a modified version of the input textgrid with the word segmented Raises: WordNotInIsleError: The word was not in the Isle dictionary """ wordTier = tg.tierDict[wordTierName] phoneTier = None if phoneTierName in tg.tierNameList: phoneTier = tg.tierDict[phoneTierName] # Load in the phone tier, if it exists: phoneEntryList = [] if phoneTier is not None: if removeOverlappingSegments: for startT, stopT, _ in wordTier.entryList: phoneTier = phoneTier.eraseRegion( startT, stopT, praatioConstants.EraseCollision.TRUNCATE, False) phoneEntryList = phoneTier.entryList # Do the naive alignment for wordStartT, wordEndT, word in wordTier.entryList: # Get the list of phones in this word try: entry = isle.lookup(word)[0] except errors.WordNotInIsleError: continue phones = entry.phonemeList.stripDiacritics().phonemes # Get the naive alignment for phones, if alignment doesn't # already exist for phones subPhoneEntryList = [] if phoneTier is not None: subPhoneEntryList = phoneTier.crop( wordStartT, wordEndT, praatioConstants.CropCollision.TRUNCATED, False).entryList if len(subPhoneEntryList) == 0: phoneDur = (wordEndT - wordStartT) / len(phones) phoneStartT = wordStartT for phone in phones: phoneEndT = phoneStartT + phoneDur subPhoneEntryList.append((phoneStartT, phoneEndT, phone)) phoneStartT = phoneEndT phoneEntryList.extend(subPhoneEntryList) # Replace or add the phone tier newPhoneTier = textgrid.IntervalTier(phoneTierName, phoneEntryList, tg.minTimestamp, tg.maxTimestamp) if phoneTier is not None: tg.replaceTier(phoneTierName, newPhoneTier) else: tg.addTier(newPhoneTier) return tg
def export_textgrid( speaker_data: Dict[str, Dict[str, List[CtmInterval]]], output_path: str, duration: float, frame_shift: int, output_format: str = TextFileType.TEXTGRID.value, ) -> None: """ Export aligned file to TextGrid Parameters ---------- speaker_data: dict[Speaker, dict[str, list[:class:`~montreal_forced_aligner.data.CtmInterval`]] Per speaker, per word/phone :class:`~montreal_forced_aligner.data.CtmInterval` output_path: str Output path of the file duration: float Duration of the file frame_shift: int Frame shift of features, in ms output_format: str, optional Output format, one of: "long_textgrid" (default), "short_textgrid", "json", or "csv" """ if frame_shift > 1: frame_shift = round(frame_shift / 1000, 4) has_data = False if output_format == "csv": csv_data = [] for speaker, data in speaker_data.items(): for annotation_type, intervals in data.items(): if len(intervals): has_data = True for a in intervals: if duration - a.end < (frame_shift * 2): # Fix rounding issues a.end = duration csv_data.append({ "Begin": a.begin, "End": a.end, "Label": a.label, "Type": annotation_type, "Speaker": speaker, }) if has_data: with open(output_path, "w", encoding="utf8", newline=None) as f: writer = csv.DictWriter( f, fieldnames=["Begin", "End", "Label", "Type", "Speaker"]) writer.writeheader() for line in csv_data: writer.writerow(line) elif output_format == "json": json_data = {"start": 0, "end": duration, "tiers": {}} for speaker, data in speaker_data.items(): for annotation_type, intervals in data.items(): if len(speaker_data) > 1: tier_name = f"{speaker} - {annotation_type}" else: tier_name = annotation_type if tier_name not in json_data["tiers"]: json_data["tiers"][tier_name] = { "type": "interval", "entries": [] } if len(intervals): has_data = True for a in intervals: if duration - a.end < (frame_shift * 2): # Fix rounding issues a.end = duration json_data["tiers"][tier_name]["entries"].append( [a.begin, a.end, a.label]) if has_data: with open(output_path, "w", encoding="utf8") as f: json.dump(json_data, f) else: # Create initial textgrid tg = tgio.Textgrid() tg.minTimestamp = 0 tg.maxTimestamp = duration for speaker, data in speaker_data.items(): for annotation_type, intervals in data.items(): if len(intervals): has_data = True if len(speaker_data) > 1: tier_name = f"{speaker} - {annotation_type}" else: tier_name = annotation_type if tier_name not in tg.tierNameList: tg.addTier( tgio.IntervalTier(tier_name, [], minT=0, maxT=duration)) for a in intervals: if duration - a.end < (frame_shift * 2): # Fix rounding issues a.end = duration tg.tierDict[tier_name].entryList.append(a.to_tg_interval()) if has_data: for tier in tg.tierDict.values(): if len(tier.entryList ) > 0 and tier.entryList[-1][1] > tg.maxTimestamp: tier.entryList[-1] = Interval(tier.entryList[-1].start, tg.maxTimestamp, tier.entryList[-1].label) tg.save(output_path, includeBlankSpaces=True, format=output_format, reportingMode="error")