def test_tok_all_words(self):
    """By default, all words should get tokenized"""
    txt = """<document xml:lang="fra"> <s>Bonjour! Comment ça va?</s> <s>Voici une deuxième phrase.</s> </document>"""
    xml = etree.fromstring(txt)
    tokenized = tokenize_xml.tokenize_xml(xml)
    as_txt = etree.tounicode(tokenized)
    # Every word gets wrapped in a <w> element; punctuation and inter-element
    # whitespace are preserved verbatim.
    ref = """<document xml:lang="fra"> <s><w>Bonjour</w>! <w>Comment</w> <w>ça</w> <w>va</w>?</s> <s><w>Voici</w> <w>une</w> <w>deuxième</w> <w>phrase</w>.</s> </document>"""
    self.assertEqual(as_txt, ref)
    # add_ids() then assigns hierarchical ids: sentence s0/s1, words s0w0, s0w1, ...
    with_ids = add_ids(tokenized)
    ids_as_txt = etree.tounicode(with_ids)
    ref_with_ids = """<document xml:lang="fra"> <s id="s0"><w id="s0w0">Bonjour</w>! <w id="s0w1">Comment</w> <w id="s0w2">ça</w> <w id="s0w3">va</w>?</s> <s id="s1"><w id="s1w0">Voici</w> <w id="s1w1">une</w> <w id="s1w2">deuxième</w> <w id="s1w3">phrase</w>.</s> </document>"""
    self.assertEqual(ids_as_txt, ref_with_ids)
def test_dna_word_nested(self):
    """You also can't have a <w> element inside a DNA element"""
    # A <w> nested (even indirectly) under a do-not-align element is invalid
    # input; add_ids() is where that gets detected and rejected.
    dna_doc = (
        '<s xml:lang="fra">Une <foo do-not-align="true">'
        "<bar><w>exclude</w></bar></foo> phrase.</s>"
    )
    tokenized = tokenize_xml.tokenize_xml(etree.fromstring(dna_doc))
    with self.assertRaises(RuntimeError):
        add_ids(tokenized)
def test_tok_div_p_s(self):
    """Text inside a DNA div, p or s does not get tokenized"""
    # Several spellings mark an element as do-not-align: "TRUE", "1", "true".
    txt = """<document xml:lang="fra"> <div> <p> <s>Une phrase.</s> </p> <p> <s>Deux phrases.</s> </p> </div> <div do-not-align="TRUE"> <p> <s>Une phrase.</s> </p> <p> <s>Deux phrases.</s> </p> </div> <div> <p do-not-align="1"> <s>Une phrase.</s> </p> <p> <s do-not-align="true">Deux phrases.</s> </p> <p> <s>Trois phrases.</s> </p> </div> </document>"""
    xml = etree.fromstring(txt)
    tokenized = tokenize_xml.tokenize_xml(xml)
    as_txt = etree.tounicode(tokenized)
    # Only text outside every do-not-align subtree gets wrapped in <w>.
    ref = """<document xml:lang="fra"> <div> <p> <s><w>Une</w> <w>phrase</w>.</s> </p> <p> <s><w>Deux</w> <w>phrases</w>.</s> </p> </div> <div do-not-align="TRUE"> <p> <s>Une phrase.</s> </p> <p> <s>Deux phrases.</s> </p> </div> <div> <p do-not-align="1"> <s>Une phrase.</s> </p> <p> <s do-not-align="true">Deux phrases.</s> </p> <p> <s><w>Trois</w> <w>phrases</w>.</s> </p> </div> </document>"""
    self.assertEqual(as_txt, ref)
    # DNA elements receive no id and do not consume an index: in the last div
    # the two aligned <p> elements are d1p0 and d1p1, skipping the DNA <p>.
    with_ids = add_ids(tokenized)
    ids_as_txt = etree.tounicode(with_ids)
    ref_with_ids = """<document xml:lang="fra"> <div id="d0"> <p id="d0p0"> <s id="d0p0s0"><w id="d0p0s0w0">Une</w> <w id="d0p0s0w1">phrase</w>.</s> </p> <p id="d0p1"> <s id="d0p1s0"><w id="d0p1s0w0">Deux</w> <w id="d0p1s0w1">phrases</w>.</s> </p> </div> <div do-not-align="TRUE"> <p> <s>Une phrase.</s> </p> <p> <s>Deux phrases.</s> </p> </div> <div id="d1"> <p do-not-align="1"> <s>Une phrase.</s> </p> <p id="d1p0"> <s do-not-align="true">Deux phrases.</s> </p> <p id="d1p1"> <s id="d1p1s0"><w id="d1p1s0w0">Trois</w> <w id="d1p1s0w1">phrases</w>.</s> </p> </div> </document>"""
    self.assertEqual(ids_as_txt, ref_with_ids)
def test_tok_some_words(self):
    """do-not-align text is excluded from tokenization"""
    txt = """<document xml:lang="fra"> <p><s>Bonjour! Comment ça va?</s></p> <p do-not-align="true"><s>Bonjour! Comment ça va?</s></p> <s do-not-align="TRUE">Voici une deuxième phrase.</s> <s>Un <foo do-not-align="1">mot ou deux</foo> à exclure.</s> </document>"""
    xml = etree.fromstring(txt)
    tokenized = tokenize_xml.tokenize_xml(xml)
    as_txt = etree.tounicode(tokenized)
    # DNA elements keep their raw text; a DNA element nested inside an
    # aligned sentence excludes only its own content.
    ref = """<document xml:lang="fra"> <p><s><w>Bonjour</w>! <w>Comment</w> <w>ça</w> <w>va</w>?</s></p> <p do-not-align="true"><s>Bonjour! Comment ça va?</s></p> <s do-not-align="TRUE">Voici une deuxième phrase.</s> <s><w>Un</w> <foo do-not-align="1">mot ou deux</foo> <w>à</w> <w>exclure</w>.</s> </document>"""
    self.assertEqual(as_txt, ref)
    # Ids are only assigned within aligned content; DNA subtrees get none.
    with_ids = add_ids(tokenized)
    ids_as_txt = etree.tounicode(with_ids)
    ref_with_ids = """<document xml:lang="fra"> <p id="p0"><s id="p0s0"><w id="p0s0w0">Bonjour</w>! <w id="p0s0w1">Comment</w> <w id="p0s0w2">ça</w> <w id="p0s0w3">va</w>?</s></p> <p do-not-align="true"><s>Bonjour! Comment ça va?</s></p> <s do-not-align="TRUE">Voici une deuxième phrase.</s> <s id="s0"><w id="s0w0">Un</w> <foo do-not-align="1">mot ou deux</foo> <w id="s0w1">à</w> <w id="s0w2">exclure</w>.</s> </document>"""
    self.assertEqual(ids_as_txt, ref_with_ids)
def test_dna_word(self):
    """You can't have a DNA <w> element, that's reserved for tokens to align"""
    doc = etree.fromstring(
        """<s xml:lang="fra">Une <w do-not-align="true">exclude</w> phrase.</s>"""
    )
    tokenized = tokenize_xml.tokenize_xml(doc)
    # add_ids() rejects a do-not-align <w> with a RuntimeError.
    with self.assertRaises(RuntimeError):
        add_ids(tokenized)
def test_simple(self):
    # Smoke test only: tokenizing a single-language document must run
    # without raising; the output is printed for manual inspection.
    doc = """<document> <s xml:lang="atj">Kwei! Tan e ici matisihin?</s> </document> """
    root = etree.fromstring(doc)
    print(etree.tounicode(tokenize_xml.tokenize_xml(root)))
def end_to_end(xml, input_filename, unit, word_unit, out_orth):
    """Run the tokenize -> add-ids -> g2p pipeline on xml, then build the
    FSG and pronouncing dictionary from the converted result.

    Returns a tuple (xml_with_ids, fsg_data, pronouncing_dictionary).
    """
    xml = add_ids(tokenize_xml(xml))
    g2p_xml, valid = convert_xml(xml, word_unit, out_orth)
    fsg_data = make_fsg(g2p_xml, input_filename, unit)
    dictionary = make_dict(g2p_xml, input_filename, unit)
    return xml, fsg_data, dictionary
def tokenize(**kwargs):
    """Tokenize XMLFILE for 'readalongs align' into TOKFILE.

    XMLFILE should have been produced by 'readalongs prepare'. TOKFILE can
    be augmented with word-specific language codes. 'readalongs align' can
    be called with either XMLFILE or TOKFILE as XML input.

    XMLFILE: Path to the XML file to tokenize, or - for stdin

    TOKFILE: Output path for the tok'd XML, or - for stdout
    [default: XMLFILE.tokenized.xml]
    """
    xmlfile = kwargs["xmlfile"]
    if kwargs["debug"]:
        LOGGER.setLevel("DEBUG")
    LOGGER.info(
        "Running readalongs tokenize(xmlfile={}, tokfile={}, force-overwrite={})."
        .format(
            kwargs["xmlfile"],
            kwargs["tokfile"],
            kwargs["force_overwrite"],
        ))
    if not kwargs["tokfile"]:
        # No explicit output path: derive it from the input name,
        # foo.xml -> foo.tokenized.xml; stdin input defaults to stdout output.
        try:
            output_tok_path = xmlfile.name
        except Exception:
            # xmlfile has no .name attribute, so it is not a real file
            output_tok_path = "<stdin>"
        if output_tok_path == "<stdin>":
            output_tok_path = "-"
        else:
            if output_tok_path.endswith(".xml"):
                output_tok_path = output_tok_path[:-4]
            output_tok_path += ".tokenized.xml"
    else:
        output_tok_path = kwargs["tokfile"]
        # Ensure the output file carries an .xml extension (except stdout).
        if not output_tok_path.endswith(".xml") and not output_tok_path == "-":
            output_tok_path += ".xml"
    # Refuse to clobber an existing file unless -f/--force-overwrite was given.
    if os.path.exists(output_tok_path) and not kwargs["force_overwrite"]:
        raise click.BadParameter(
            "Output file %s exists already, use -f to overwrite."
            % output_tok_path)
    try:
        xml = etree.parse(xmlfile).getroot()
    except etree.XMLSyntaxError as e:
        raise click.BadParameter(
            "Error parsing input file %s as XML, please verify it. Parser error: %s"
            % (xmlfile, e))
    xml = tokenize_xml(xml)
    if output_tok_path == "-":
        write_xml(sys.stdout.buffer, xml)
    else:
        save_xml(output_tok_path, xml)
    LOGGER.info("Wrote {}".format(output_tok_path))
def test_comments(self):
    # Smoke test only: a document that already contains a <w> element must
    # tokenize without raising; the output is printed for manual inspection.
    doc = """<document> <s xml:lang="atj">Kwei! (<w xml:lang="fra">Bonjour</w>!)</s> <s xml:lang="atj">Tan e ici matisihin?</s> </document> """
    root = etree.fromstring(doc)
    print(etree.tounicode(tokenize_xml.tokenize_xml(root)))
def test_mixed_lang(self):
    # Smoke test only: a document mixing two xml:lang values must tokenize
    # without raising; the output is printed for manual inspection.
    doc = """<document> <s xml:lang="atj">Kwei! Tan e ici matisihin?</s> <s xml:lang="fra">Bonjour! Comment ça va?</s> </document> """
    root = etree.fromstring(doc)
    print(etree.tounicode(tokenize_xml.tokenize_xml(root)))
def test_simple(self):
    """Each word of a single-language sentence gets wrapped in a <w> element."""
    txt = """<document> <s xml:lang="atj">Kwei! Tan e ici matisihin?</s> </document> """
    ref = """<document> <s xml:lang="atj"><w>Kwei</w>! <w>Tan</w> <w>e</w> <w>ici</w> <w>matisihin</w>?</s> </document>"""
    result = tokenize_xml.tokenize_xml(etree.fromstring(txt))
    self.assertEqual(etree.tounicode(result), ref)
def test_mixed_words(self):
    """A document that already contains <w> elements is passed through unchanged."""
    txt = """<document> <s xml:lang="atj">Kwei! (<w xml:lang="fra">Bonjour</w>!)</s> <s xml:lang="atj">Tan e ici matisihin?</s> </document> """
    ref = """<document> <s xml:lang="atj">Kwei! (<w xml:lang="fra">Bonjour</w>!)</s> <s xml:lang="atj">Tan e ici matisihin?</s> </document>"""
    result = tokenize_xml.tokenize_xml(etree.fromstring(txt))
    self.assertEqual(etree.tounicode(result), ref)
def test_mixed_lang(self):
    """Each sentence is tokenized according to its own xml:lang value."""
    txt = """<document> <s xml:lang="atj">Kwei! Tan e ici matisihin?</s> <s xml:lang="fra">Bonjour! Comment ça va?</s> </document> """
    # Both sentences get word-tokenized, each under its own language.
    ref = """<document> <s xml:lang="atj"><w>Kwei</w>! <w>Tan</w> <w>e</w> <w>ici</w> <w>matisihin</w>?</s> <s xml:lang="fra"><w>Bonjour</w>! <w>Comment</w> <w>ça</w> <w>va</w>?</s> </document>"""
    xml = etree.fromstring(txt)
    tokenized = tokenize_xml.tokenize_xml(xml)
    self.assertEqual(etree.tounicode(tokenized), ref)
def test_mixed_words(self):
    """Tokenization should be bypassed when <w> elements are already found in the input"""
    # The presence of any <w> marks the document as pre-tokenized, so even
    # the second, untagged sentence is left untouched.
    txt = """<document> <s xml:lang="atj">Kwei! (<w xml:lang="fra">Bonjour</w>!)</s> <s xml:lang="atj">Tan e ici matisihin?</s> </document> """
    ref = """<document> <s xml:lang="atj">Kwei! (<w xml:lang="fra">Bonjour</w>!)</s> <s xml:lang="atj">Tan e ici matisihin?</s> </document>"""
    xml = etree.fromstring(txt)
    tokenized = tokenize_xml.tokenize_xml(xml)
    self.assertEqual(etree.tounicode(tokenized), ref)
def test_comments(self):
    """Make sure tokenize_xml ignores stuff inside comments"""
    txt = """<document> <s xml:lang="atj">Kwei! (<subsent xml:lang="fra">Bonjour</subsent>!)</s> <!--<s>comments</s> <w>should</w> <p>be ignored</p>--> <s xml:lang="atj">Tan e ici matisihin?</s> </document> """
    # The <w> inside the XML comment must not trigger the "already
    # tokenized" bypass, and the comment itself is preserved verbatim.
    ref = """<document> <s xml:lang="atj"><w>Kwei</w>! (<subsent xml:lang="fra"><w>Bonjour</w></subsent>!)</s> <!--<s>comments</s> <w>should</w> <p>be ignored</p>--> <s xml:lang="atj"><w>Tan</w> <w>e</w> <w>ici</w> <w>matisihin</w>?</s> </document>"""
    xml = etree.fromstring(txt)
    tokenized = tokenize_xml.tokenize_xml(xml)
    self.assertEqual(etree.tounicode(tokenized), ref)
def test_dna_word(self):
    """You can't have a DNA <w> element, that's reserved for tokens to align."""
    txt = """<s xml:lang="fra">Une <w do-not-align="true">exclude</w> phrase.</s>"""
    xml = etree.fromstring(txt)
    tokenized = tokenize_xml.tokenize_xml(xml)
    # add_ids() is where the invalid do-not-align <w> is detected and rejected.
    self.assertRaises(RuntimeError, add_ids, tokenized)
def align_audio(
    xml_path, audio_path, unit="w", bare=False, config=None, save_temps=None,
):
    """
    Align an XML input file to an audio file.

    Parameters
    ----------
    xml_path : str
        Path to XML input file in TEI-like format
    audio_path : str
        Path to audio input. Must be in a format supported by ffmpeg
    unit : str, optional
        Element to create alignments for, by default 'w'
    bare : boolean, optional
        If False, split silence into adjoining tokens (default)
        If True, keep the bare tokens without adjoining silences.
    config : object, optional
        Uses ReadAlong-Studio configuration
    save_temps : Union[str, None], optional
        save temporary files, by default None

    Returns
    -------
    Dict[str, List]
        {"words": [{"id": ..., "start": ..., "end": ...}, ...],
         "tokenized": the tokenized XML tree with ids added}
        Times are in seconds.

    Raises
    ------
    RuntimeError
        If the XML input cannot be parsed, if alignment produces no
        segments, only noise/silence segments, or a number of segments
        different from the number of tokens in the XML.
    """
    results: Dict[str, List] = {"words": []}

    # First do G2P: parse the XML, annotate it (images, supplementary XML,
    # language ids), tokenize it, assign ids, then phonemize it.
    try:
        xml = etree.parse(xml_path).getroot()
    except etree.XMLSyntaxError as e:
        raise RuntimeError("Error parsing XML input file %s: %s."
                           % (xml_path, e))
    if config and "images" in config:
        xml = add_images(xml, config)
    if config and "xml" in config:
        xml = add_supplementary_xml(xml, config)
    xml = add_lang_ids(xml, unit="s")
    xml = tokenize_xml(xml)
    if save_temps:
        save_xml(save_temps + ".tokenized.xml", xml)
    # Keep the id'd tree: it is returned to the caller and used below to
    # count the expected number of tokens.
    results["tokenized"] = xml = add_ids(xml)
    if save_temps:
        save_xml(save_temps + ".ids.xml", xml)
    xml = convert_xml(xml)
    if save_temps:
        save_xml(save_temps + ".g2p.xml", xml)

    # Now generate dictionary and FSG for the decoder, either next to
    # save_temps or as named temporary files.
    dict_data = make_dict(xml, xml_path, unit=unit)
    if save_temps:
        dict_file = io.open(save_temps + ".dict", "wb")
    else:
        dict_file = PortableNamedTemporaryFile(prefix="readalongs_dict_",
                                               delete=False)
    dict_file.write(dict_data.encode("utf-8"))
    dict_file.flush()
    fsg_data = make_fsg(xml, xml_path, unit=unit)
    if save_temps:
        fsg_file = io.open(save_temps + ".fsg", "wb")
    else:
        fsg_file = PortableNamedTemporaryFile(prefix="readalongs_fsg_",
                                              delete=False)
    fsg_file.write(fsg_data.encode("utf-8"))
    fsg_file.flush()

    # Now do alignment
    cfg = soundswallower.Decoder.default_config()
    model_path = soundswallower.get_model_path()
    cfg.set_boolean("-remove_noise", False)
    cfg.set_boolean("-remove_silence", False)
    cfg.set_string("-hmm", os.path.join(model_path, "en-us"))
    cfg.set_string("-dict", dict_file.name)
    cfg.set_string("-fsg", fsg_file.name)
    # cfg.set_string('-samprate', "no no")
    cfg.set_float("-beam", 1e-100)
    cfg.set_float("-wbeam", 1e-80)

    # Mono, 16-bit samples for the decoder.
    audio = read_audio_from_file(audio_path)
    audio = audio.set_channels(1).set_sample_width(2)
    # Downsampling is (probably) not necessary
    cfg.set_float("-samprate", audio.frame_rate)

    # Process audio: mute or remove the do-not-align (DNA) segments, if any.
    do_not_align_segments = None
    if config and "do-not-align" in config:
        # Reverse sort un-alignable segments so removing later segments
        # first doesn't shift the positions of earlier ones.
        do_not_align_segments = sorted(
            config["do-not-align"]["segments"], key=lambda x: x["begin"],
            reverse=True
        )
        method = config["do-not-align"].get("method", "remove")
        # Determine do-not-align method
        if method == "mute":
            dna_method = mute_section
        elif method == "remove":
            dna_method = remove_section
        else:
            # NOTE(review): an unknown method is only logged; execution then
            # continues and would hit an undefined processed_audio below.
            LOGGER.error("Unknown do-not-align method declared")
        # Process audio and save temporary files
        if method == "mute" or method == "remove":
            processed_audio = audio
            for seg in do_not_align_segments:
                # Segment boundaries are expressed in milliseconds.
                processed_audio = dna_method(
                    processed_audio, int(seg["begin"]), int(seg["end"])
                )
            if save_temps:
                _, ext = os.path.splitext(audio_path)
                try:
                    processed_audio.export(
                        save_temps + "_processed" + ext, format=ext[1:]
                    )
                except CouldntEncodeError:
                    # No encoder for the source format: fall back to wav.
                    os.remove(save_temps + "_processed" + ext)
                    LOGGER.warn(
                        f"Couldn't find encoder for '{ext[1:]}', defaulting to 'wav'"
                    )
                    processed_audio.export(save_temps + "_processed" + ".wav")
        raw_data = processed_audio.raw_data
    else:
        raw_data = audio.raw_data

    # Round the FFT size up to the next power of two no smaller than one
    # analysis window, as required by the decoder.
    frame_points = int(cfg.get_float("-samprate") * cfg.get_float("-wlen"))
    fft_size = 1
    while fft_size < frame_points:
        fft_size = fft_size << 1
    cfg.set_int("-nfft", fft_size)
    ps = soundswallower.Decoder(cfg)
    frame_size = 1.0 / cfg.get_int("-frate")

    def frames_to_time(frames):
        # Convert decoder frame indexes to seconds.
        return frames * frame_size

    ps.start_utt()
    ps.process_raw(raw_data, no_search=False, full_utt=True)
    ps.end_utt()

    if not ps.seg():
        raise RuntimeError(
            "Alignment produced no segments, "
            "please examine dictionary and input audio and text."
        )

    for seg in ps.seg():
        start = frames_to_time(seg.start_frame)
        end = frames_to_time(seg.end_frame + 1)
        # change to ms
        start_ms = start * 1000
        end_ms = end * 1000
        if do_not_align_segments and method == "remove":
            # Shift times to compensate for audio cut out before decoding.
            start_ms += calculate_adjustment(start_ms, do_not_align_segments)
            end_ms += calculate_adjustment(end_ms, do_not_align_segments)
            start_ms, end_ms = correct_adjustments(
                start_ms, end_ms, do_not_align_segments
            )
        # change back to seconds to write to smil
        start = start_ms / 1000
        end = end_ms / 1000
        if seg.word in ("<sil>", "[NOISE]"):
            continue
        else:
            results["words"].append({"id": seg.word, "start": start, "end": end})
        LOGGER.info("Segment: %s (%.3f : %.3f)", seg.word, start, end)

    if len(results["words"]) == 0:
        raise RuntimeError(
            "Alignment produced only noise or silence segments, "
            "please examine dictionary and input audio and text."
        )
    if len(results["words"]) != len(results["tokenized"].xpath("//" + unit)):
        raise RuntimeError(
            "Alignment produced a different number of segments and tokens, "
            "please examine dictionary and input audio and text."
        )
    # End time of the last decoded segment; used to close the final word.
    final_end = end

    if not bare:
        # Split adjoining silence/noise between words: the midpoint of each
        # gap becomes the boundary between the previous and the next word.
        last_end = 0.0
        last_word = dict()
        for word in results["words"]:
            silence = word["start"] - last_end
            midpoint = last_end + silence / 2
            if silence > 0:
                if last_word:
                    last_word["end"] = midpoint
                word["start"] = midpoint
            last_word = word
            last_end = word["end"]
        # Attach half of any trailing silence to the last word.
        silence = final_end - last_end
        if silence > 0:
            # NOTE(review): last_word starts as dict(), so `is not None` is
            # always true; a truthiness check (as above) may have been intended.
            if last_word is not None:
                last_word["end"] += silence / 2

    # Clean up the temp files unless the caller asked to keep them.
    dict_file.close()
    if not save_temps:
        os.unlink(dict_file.name)
    fsg_file.close()
    if not save_temps:
        os.unlink(fsg_file.name)
    return results
def align_audio(  # noqa: C901
    xml_path,
    audio_path,
    unit="w",
    bare=False,
    config=None,
    save_temps=None,
    verbose_g2p_warnings=False,
):
    """Align an XML input file to an audio file.

    Args:
        xml_path (str): Path to XML input file in TEI-like format
        audio_path (str): Path to audio input. Must be in a format supported
            by ffmpeg
        unit (str): Optional; Element to create alignments for, by default 'w'
        bare (boolean): Optional; If False, split silence into adjoining
            tokens (default). If True, keep the bare tokens without
            adjoining silences.
        config (object): Optional; ReadAlong-Studio configuration to use
        save_temps (str): Optional; Save temporary files, by default None
        verbose_g2p_warnings (boolean): Optional; display all g2p errors and
            warnings iff True

    Returns:
        Dict[str, List]: a results dictionary with
            "words": list of {"id", "start", "end"} alignments (seconds),
            "tokenized": the tokenized XML tree with ids added,
            "audio": the audio, with any <silence> elements inserted.

    Raises:
        RuntimeError: if the XML input cannot be parsed, if g2p fails, if
            alignment of a sequence produces no segments, if only noise or
            silence was aligned, or if a silence element has an unparseable
            duration.
    """
    results: Dict[str, List] = {"words": [], "audio": None}

    # First do G2P
    try:
        xml = etree.parse(xml_path).getroot()
    except etree.XMLSyntaxError as e:
        raise RuntimeError(
            "Error parsing XML input file %s: %s." % (xml_path, e)) from e
    if config and "images" in config:
        xml = add_images(xml, config)
    if config and "xml" in config:
        xml = add_supplementary_xml(xml, config)
    xml = tokenize_xml(xml)
    if save_temps:
        save_xml(save_temps + ".tokenized.xml", xml)
    # Keep the id'd tree: it is returned to the caller and consulted below.
    results["tokenized"] = xml = add_ids(xml)
    if save_temps:
        save_xml(save_temps + ".ids.xml", xml)
    xml, valid = convert_xml(xml, verbose_warnings=verbose_g2p_warnings)
    if save_temps:
        save_xml(save_temps + ".g2p.xml", xml)
    if not valid:
        raise RuntimeError(
            "Some words could not be g2p'd correctly. Aborting. "
            "Run with --g2p-verbose for more detailed g2p error logs.")

    # Prepare the SoundsSwallower (formerly PocketSphinx) configuration
    cfg = soundswallower.Decoder.default_config()
    model_path = soundswallower.get_model_path()
    cfg.set_boolean("-remove_noise", False)
    cfg.set_boolean("-remove_silence", False)
    cfg.set_string("-hmm", os.path.join(model_path, "en-us"))
    # cfg.set_string('-samprate', "no no")
    cfg.set_float("-beam", 1e-100)
    cfg.set_float("-wbeam", 1e-80)

    # Read the audio file; mono 16-bit samples for the decoder.
    audio = read_audio_from_file(audio_path)
    audio = audio.set_channels(1).set_sample_width(2)
    # NOTE(review): len(audio.raw_data) is a byte count (2 bytes/sample at
    # 16-bit mono), not milliseconds — confirm dna_union() expects this.
    audio_length_in_ms = len(audio.raw_data)
    # Downsampling is (probably) not necessary
    cfg.set_float("-samprate", audio.frame_rate)

    # Process audio, silencing or removing any DNA segments
    dna_segments = []
    removed_segments = []
    if config and "do-not-align" in config:
        # Sort un-alignable segments and join overlapping ones
        dna_segments = sort_and_join_dna_segments(
            config["do-not-align"]["segments"])
        method = config["do-not-align"].get("method", "remove")
        # Determine do-not-align method
        if method == "mute":
            dna_method = mute_section
        elif method == "remove":
            dna_method = remove_section
        else:
            LOGGER.error("Unknown do-not-align method declared")
        # Process audio and save temporary files
        if method in ("mute", "remove"):
            processed_audio = audio
            # Process the DNA segments in reverse order so we don't have to correct
            # for previously processed ones when using the "remove" method.
            for seg in reversed(dna_segments):
                processed_audio = dna_method(processed_audio,
                                             int(seg["begin"]),
                                             int(seg["end"]))
            if save_temps:
                _, ext = os.path.splitext(audio_path)
                try:
                    processed_audio.export(save_temps + "_processed" + ext,
                                           format=ext[1:])
                except CouldntEncodeError:
                    # No encoder for the source format: fall back to wav.
                    try:
                        os.remove(save_temps + "_processed" + ext)
                    except BaseException:
                        pass
                    LOGGER.warning(
                        f"Couldn't find encoder for '{ext[1:]}', defaulting to 'wav'"
                    )
                    processed_audio.export(save_temps + "_processed" + ".wav")
            # NOTE(review): segments are marked as removed even when method is
            # "mute", where the audio is not shortened — confirm the timing
            # adjustments below are intended for the "mute" case too.
            removed_segments = dna_segments
        audio_data = processed_audio
    else:
        audio_data = audio

    # Initialize the SoundSwallower decoder with the sample rate from the audio
    frame_points = int(cfg.get_float("-samprate") * cfg.get_float("-wlen"))
    fft_size = 1
    while fft_size < frame_points:
        fft_size = fft_size << 1
    cfg.set_int("-nfft", fft_size)
    frame_size = 1.0 / cfg.get_int("-frate")

    # Note: the frames are typically 0.01s long (i.e., the frame rate is typically 100),
    # while the audio segments manipulated using pydub are sliced and accessed in
    # millisecond intervals. For audio segments, the ms slice assumption is hard-coded
    # all over, while frames_to_time() is used to convert segment boundaries returned by
    # soundswallower, which are indexes in frames, into durations in seconds.
    def frames_to_time(frames):
        return frames * frame_size

    # Extract the list of sequences of words in the XML
    word_sequences = get_sequences(xml, xml_path, unit=unit)
    end = 0
    for i, word_sequence in enumerate(word_sequences):

        # Suffix for the temp files of the 2nd and subsequent sequences.
        i_suffix = "" if i == 0 else "." + str(i + 1)

        # Generate dictionary and FSG for the current sequence of words
        dict_data = make_dict(word_sequence.words, xml_path, unit=unit)
        if save_temps:
            dict_file = io.open(save_temps + ".dict" + i_suffix, "wb")
        else:
            dict_file = PortableNamedTemporaryFile(prefix="readalongs_dict_",
                                                   delete=False)
        dict_file.write(dict_data.encode("utf-8"))
        dict_file.close()

        fsg_data = make_fsg(word_sequence.words, xml_path)
        if save_temps:
            fsg_file = io.open(save_temps + ".fsg" + i_suffix, "wb")
        else:
            fsg_file = PortableNamedTemporaryFile(prefix="readalongs_fsg_",
                                                  delete=False)
        fsg_file.write(fsg_data.encode("utf-8"))
        fsg_file.close()

        # Extract the part of the audio corresponding to this word sequence
        audio_segment = extract_section(audio_data, word_sequence.start,
                                        word_sequence.end)
        if save_temps and audio_segment is not audio_data:
            write_audio_to_file(audio_segment, save_temps + ".wav" + i_suffix)

        # Configure soundswallower for this sequence's dict and fsg
        cfg.set_string("-dict", dict_file.name)
        cfg.set_string("-fsg", fsg_file.name)
        ps = soundswallower.Decoder(cfg)

        # Align this word sequence
        ps.start_utt()
        ps.process_raw(audio_segment.raw_data, no_search=False, full_utt=True)
        ps.end_utt()

        if not ps.seg():
            raise RuntimeError(
                "Alignment produced no segments, "
                "please examine dictionary and input audio and text.")

        # List of removed segments for the sequence we are currently processing
        curr_removed_segments = dna_union(word_sequence.start,
                                          word_sequence.end,
                                          audio_length_in_ms,
                                          removed_segments)

        prev_segment_count = len(results["words"])
        for seg in ps.seg():
            if seg.word in ("<sil>", "[NOISE]"):
                continue
            start = frames_to_time(seg.start_frame)
            end = frames_to_time(seg.end_frame + 1)
            # change to ms
            start_ms = start * 1000
            end_ms = end * 1000
            if curr_removed_segments:
                # Shift times to compensate for audio cut out before decoding.
                start_ms += calculate_adjustment(start_ms,
                                                 curr_removed_segments)
                end_ms += calculate_adjustment(end_ms, curr_removed_segments)
                start_ms, end_ms = correct_adjustments(start_ms, end_ms,
                                                       curr_removed_segments)
            # change back to seconds to write to smil
            start = start_ms / 1000
            end = end_ms / 1000
            results["words"].append({
                "id": seg.word,
                "start": start,
                "end": end
            })
            LOGGER.info("Segment: %s (%.3f : %.3f)", seg.word, start, end)
        aligned_segment_count = len(results["words"]) - prev_segment_count
        if aligned_segment_count != len(word_sequence.words):
            LOGGER.warning(
                f"Word sequence {i+1} had {len(word_sequence.words)} tokens "
                f"but produced {aligned_segment_count} segments. "
                "Check that the anchors are well positioned or "
                "that the audio corresponds to the text.")
    # End time of the last decoded segment; used to close the final word.
    final_end = end

    if len(results["words"]) == 0:
        raise RuntimeError(
            "Alignment produced only noise or silence segments, "
            "please verify that the text is an actual transcript of the audio."
        )
    if len(results["words"]) != len(results["tokenized"].xpath("//" + unit)):
        LOGGER.warning(
            "Alignment produced a different number of segments and tokens than "
            "were in the input. Sequences between some anchors probably did not "
            "align successfully. Look for more anchors-related warnings above in the log."
        )

    if not bare:
        # Take all the boundaries (anchors) around segments and add them as DNA
        # segments for the purpose of splitting silences
        dna_for_silence_splitting = copy.deepcopy(dna_segments)
        last_end = None
        for seq in word_sequences:
            if last_end or seq.start:
                dna_for_silence_splitting.append({
                    "begin": (last_end or seq.start),
                    "end": (seq.start or last_end)
                })
            last_end = seq.end
        if last_end:
            dna_for_silence_splitting.append({
                "begin": last_end,
                "end": last_end
            })
        dna_for_silence_splitting = sort_and_join_dna_segments(
            dna_for_silence_splitting)
        split_silences(results["words"], final_end, dna_for_silence_splitting)

    # Map word id -> alignment times, for the silence-insertion pass below.
    words_dict = {
        x["id"]: {
            "start": x["start"],
            "end": x["end"]
        }
        for x in results["words"]
    }

    # Insert explicit <silence dur="..."/> elements into the audio and shift
    # the alignments of every word that follows an inserted silence.
    silence_offsets = defaultdict(int)
    silence = 0
    if results["tokenized"].xpath("//silence"):
        endpoint = 0
        all_good = True
        for el in results["tokenized"].xpath("//*"):
            if el.tag == "silence" and "dur" in el.attrib:
                try:
                    silence_ms = parse_time(el.attrib["dur"])
                except ValueError as err:
                    # Keep scanning so all bad durations get reported at once.
                    LOGGER.error(
                        f'Invalid silence element in {xml_path}: invalid "time" '
                        f'attribute "{el.attrib["dur"]}": {err}')
                    all_good = False
                    continue
                silence_segment = AudioSegment.silent(
                    duration=silence_ms)  # create silence segment
                silence += silence_ms  # add silence length to total silence
                audio = (audio[:endpoint] + silence_segment + audio[endpoint:]
                         )  # insert silence at previous endpoint
                endpoint += silence_ms  # add silence to previous endpoint
            if el.tag == "w":
                silence_offsets[el.attrib["id"]] += (
                    silence / 1000
                )  # add silence in seconds to silence offset for word id
                endpoint = (words_dict[el.attrib["id"]]["end"] * 1000
                            ) + silence  # bump endpoint and include silence
        if not all_good:
            raise RuntimeError(
                f"Could not parse all duration attributes in silence elements in {xml_path}, please make sure each silence "
                'element is properly formatted, e.g., <silence dur="1.5s"/>. Aborting.'
            )
    if silence:
        # Apply the accumulated per-word offsets to the alignments.
        for word in results["words"]:
            word["start"] += silence_offsets[word["id"]]
            word["end"] += silence_offsets[word["id"]]
    results["audio"] = audio
    return results