Exemple #1
0
def go(input_filename,
       output_filename,
       word_unit="w",
       output_orthography="eng-arpabet"):
    xml = load_xml(input_filename)
    converted_xml = convert_xml(xml, word_unit, output_orthography)
    save_xml(output_filename, converted_xml)
Exemple #2
0
def tokenize(**kwargs):
    """Tokenize XMLFILE for 'readalongs align' into TOKFILE.
    XMLFILE should have been produce by 'readalongs prepare'.
    TOKFILE can be augmented with word-specific language codes.
    'readalongs align' can be called with either XMLFILE or TOKFILE as XML input.

    XMLFILE: Path to the XML file to tokenize, or - for stdin

    TOKFILE: Output path for the tok'd XML, or - for stdout [default: XMLFILE.tokenized.xml]
    """
    xmlfile = kwargs["xmlfile"]

    if kwargs["debug"]:
        LOGGER.setLevel("DEBUG")
        LOGGER.info(
            "Running readalongs tokenize(xmlfile={}, tokfile={}, force-overwrite={})."
            .format(
                kwargs["xmlfile"],
                kwargs["tokfile"],
                kwargs["force_overwrite"],
            ))

    if not kwargs["tokfile"]:
        try:
            output_tok_path = xmlfile.name
        except Exception:
            output_tok_path = "<stdin>"
        if output_tok_path == "<stdin>":
            output_tok_path = "-"
        else:
            if output_tok_path.endswith(".xml"):
                output_tok_path = output_tok_path[:-4]
            output_tok_path += ".tokenized.xml"
    else:
        output_tok_path = kwargs["tokfile"]
        if not output_tok_path.endswith(".xml") and not output_tok_path == "-":
            output_tok_path += ".xml"

    if os.path.exists(output_tok_path) and not kwargs["force_overwrite"]:
        raise click.BadParameter(
            "Output file %s exists already, use -f to overwrite." %
            output_tok_path)

    try:
        xml = etree.parse(xmlfile).getroot()
    except etree.XMLSyntaxError as e:
        raise click.BadParameter(
            "Error parsing input file %s as XML, please verify it. Parser error: %s"
            % (xmlfile, e))

    xml = tokenize_xml(xml)

    if output_tok_path == "-":
        write_xml(sys.stdout.buffer, xml)
    else:
        save_xml(output_tok_path, xml)
    LOGGER.info("Wrote {}".format(output_tok_path))
 def testConvert(self):
     xml_path = os.path.join(self.data_dir, "ej-fra-converted.xml")
     xml = etree.parse(xml_path).getroot()
     convert_to_xhtml(xml)
     with PortableNamedTemporaryFile(suffix=".xml") as tf:
         save_xml(tf.name, xml)
         txt = load_txt(tf.name)
         self.maxDiff = None
         self.assertEqual(
             txt,
             load_txt(os.path.join(self.data_dir,
                                   "ej-fra-converted.xhtml")),
         )
def go(
    input_filename,
    output_xml_filename,
    output_fsg_filename,
    output_dict_filename,
    unit,
    word_unit,
    out_orth,
):
    xml = load_xml(input_filename)
    xml, fsg, dct = end_to_end(xml, input_filename, unit, word_unit, out_orth)
    save_xml(output_xml_filename, xml)
    save_txt(output_fsg_filename, fsg)
    save_txt(output_dict_filename, dct)
Exemple #5
0
def main(input_path, output_path, unit="p"):
    xml = load_xml(input_path)
    add_lang_ids(xml, unit)
    save_xml(output_path, xml)
Exemple #6
0
def save_readalong(  # noqa C901
        # noqa C901 - ignore the complexity of this function
        # this * forces all arguments to be passed by name, because I don't want any
        # code to depend on their order in the future
        *,
        align_results: Dict[str, List],
        output_dir: str,
        output_basename: str,
        config=None,
        audiofile: str,
        audiosegment: AudioSegment = None,
        output_formats=(),
):
    """Save the results from align_audio() into the output files required for a
        readalong

    Args:
        align_results(Dict[str,List]): return value from align_audio()
        output_dir (str): directory where to save the readalong,
            output_dir should already exist, files it contains may be overwritten
        output_basename (str): basename of the files to save in output_dir
        config ([type TODO], optional): alignment configuration loaded from the json
        audiofile (str): path to the audio file passed to align_audio()
        output_formats (List[str], optional): list of desired output formats
        audiosegment (AudioSegment): a pydub.AudioSegment object of processed audio.
                              if None, then original audio will be saved at `audiofile`

    Returns:
        None

    Raises:
        [TODO]
    """
    # Round all times to three digits, anything more is excess precision
    # poluting the output files, and usually due to float rounding errors anyway.
    for w in align_results["words"]:
        w["start"] = round(w["start"], 3)
        w["end"] = round(w["end"], 3)

    output_base = os.path.join(output_dir, output_basename)

    # Create textgrid object if outputting to TextGrid or eaf
    if "TextGrid" in output_formats or "eaf" in output_formats:
        audio = read_audio_from_file(audiofile)
        duration = audio.frame_count() / audio.frame_rate
        words, sentences = return_words_and_sentences(align_results)
        textgrid = write_to_text_grid(words, sentences, duration)

        if "TextGrid" in output_formats:
            textgrid.to_file(output_base + ".TextGrid")

        if "eaf" in output_formats:
            textgrid.to_eaf().to_file(output_base + ".eaf")

    # Create webvtt object if outputting to vtt or srt
    if "srt" in output_formats or "vtt" in output_formats:
        words, sentences = return_words_and_sentences(align_results)
        cc_sentences = write_to_subtitles(sentences)
        cc_words = write_to_subtitles(words)

        if "srt" in output_formats:
            cc_sentences.save_as_srt(output_base + "_sentences.srt")
            cc_words.save_as_srt(output_base + "_words.srt")

        if "vtt" in output_formats:
            cc_words.save(output_base + "_words.vtt")
            cc_sentences.save(output_base + "_sentences.vtt")

    tokenized_xml_path = output_base + ".xml"
    save_xml(tokenized_xml_path, align_results["tokenized"])

    if "xhtml" in output_formats:
        convert_to_xhtml(align_results["tokenized"])
        tokenized_xhtml_path = output_base + ".xhtml"
        save_xml(tokenized_xhtml_path, align_results["tokenized"])

    _, audio_ext = os.path.splitext(audiofile)
    audio_path = output_base + audio_ext
    audio_format = audio_ext[1:]
    if audiosegment:
        if audio_format in ["m4a", "aac"]:
            audio_format = "ipod"
        try:
            audiosegment.export(audio_path, format=audio_format)
        except CouldntEncodeError:
            LOGGER.warning(f"The audio file at {audio_path} could \
                not be exported in the {audio_format} format. \
                Please ensure your installation of ffmpeg has \
                the necessary codecs.")
            audio_path = output_base + ".wav"
            audiosegment.export(audio_path, format="wav")
    else:
        shutil.copy(audiofile, audio_path)

    smil_path = output_base + ".smil"
    smil = make_smil(
        os.path.basename(tokenized_xml_path),
        os.path.basename(audio_path),
        align_results,
    )
    save_txt(smil_path, smil)

    if "html" in output_formats:
        html_out_path = output_base + ".html"
        html_out = create_web_component_html(tokenized_xml_path, smil_path,
                                             audio_path)
        with open(html_out_path, "w") as f:
            f.write(html_out)

    save_minimal_index_html(
        os.path.join(output_dir, "index.html"),
        os.path.basename(tokenized_xml_path),
        os.path.basename(smil_path),
        os.path.basename(audio_path),
    )

    # Copy the image files to the output's asset directory, if any are found
    if config and "images" in config:
        assets_dir = os.path.join(output_dir, "assets")
        try:
            os.mkdir(assets_dir)
        except FileExistsError:
            if not os.path.isdir(assets_dir):
                raise
        for _, image in config["images"].items():
            if image[0:4] == "http":
                LOGGER.warning(
                    f"Please make sure {image} is accessible to clients using your read-along."
                )
            else:
                try:
                    shutil.copy(image, assets_dir)
                except Exception as e:
                    LOGGER.warning(
                        f"Please copy {image} to {assets_dir} before deploying your read-along. ({e})"
                    )
                if os.path.basename(image) != image:
                    LOGGER.warning(
                        f"Read-along images were tested with absolute urls (starting with http(s):// "
                        f"and filenames without a path. {image} might not work as specified."
                    )
Exemple #7
0
def align_audio(  # noqa: C901
    xml_path,
    audio_path,
    unit="w",
    bare=False,
    config=None,
    save_temps=None,
    verbose_g2p_warnings=False,
):
    """Align an XML input file to an audio file.

    Args:
        xml_path (str): Path to XML input file in TEI-like format
        audio_path (str): Path to audio input. Must be in a format supported by ffmpeg
        unit (str): Optional; Element to create alignments for, by default 'w'
        bare (boolean): Optional;
            If False, split silence into adjoining tokens (default)
            If True, keep the bare tokens without adjoining silences.
        config (object): Optional; ReadAlong-Studio configuration to use
        save_temps (str): Optional; Save temporary files, by default None
        verbose_g2p_warnings (boolean): Optional; display all g2p errors and warnings
            iff True

    Returns:
        Dict[str, List]: TODO

    Raises:
        TODO
    """
    results: Dict[str, List] = {"words": [], "audio": None}

    # First do G2P
    try:
        xml = etree.parse(xml_path).getroot()
    except etree.XMLSyntaxError as e:
        raise RuntimeError("Error parsing XML input file %s: %s." %
                           (xml_path, e)) from e
    if config and "images" in config:
        xml = add_images(xml, config)
    if config and "xml" in config:
        xml = add_supplementary_xml(xml, config)
    xml = tokenize_xml(xml)
    if save_temps:
        save_xml(save_temps + ".tokenized.xml", xml)
    results["tokenized"] = xml = add_ids(xml)
    if save_temps:
        save_xml(save_temps + ".ids.xml", xml)
    xml, valid = convert_xml(xml, verbose_warnings=verbose_g2p_warnings)
    if save_temps:
        save_xml(save_temps + ".g2p.xml", xml)
    if not valid:
        raise RuntimeError(
            "Some words could not be g2p'd correctly. Aborting. "
            "Run with --g2p-verbose for more detailed g2p error logs.")

    # Prepare the SoundsSwallower (formerly PocketSphinx) configuration
    cfg = soundswallower.Decoder.default_config()
    model_path = soundswallower.get_model_path()
    cfg.set_boolean("-remove_noise", False)
    cfg.set_boolean("-remove_silence", False)
    cfg.set_string("-hmm", os.path.join(model_path, "en-us"))
    # cfg.set_string('-samprate', "no no")
    cfg.set_float("-beam", 1e-100)
    cfg.set_float("-wbeam", 1e-80)

    # Read the audio file
    audio = read_audio_from_file(audio_path)
    audio = audio.set_channels(1).set_sample_width(2)
    audio_length_in_ms = len(audio.raw_data)
    #  Downsampling is (probably) not necessary
    cfg.set_float("-samprate", audio.frame_rate)

    # Process audio, silencing or removing any DNA segments
    dna_segments = []
    removed_segments = []
    if config and "do-not-align" in config:
        # Sort un-alignable segments and join overlapping ones
        dna_segments = sort_and_join_dna_segments(
            config["do-not-align"]["segments"])
        method = config["do-not-align"].get("method", "remove")
        # Determine do-not-align method
        if method == "mute":
            dna_method = mute_section
        elif method == "remove":
            dna_method = remove_section
        else:
            LOGGER.error("Unknown do-not-align method declared")
        # Process audio and save temporary files
        if method in ("mute", "remove"):
            processed_audio = audio
            # Process the DNA segments in reverse order so we don't have to correct
            # for previously processed ones when using the "remove" method.
            for seg in reversed(dna_segments):
                processed_audio = dna_method(processed_audio,
                                             int(seg["begin"]),
                                             int(seg["end"]))
            if save_temps:
                _, ext = os.path.splitext(audio_path)
                try:
                    processed_audio.export(save_temps + "_processed" + ext,
                                           format=ext[1:])
                except CouldntEncodeError:
                    try:
                        os.remove(save_temps + "_processed" + ext)
                    except BaseException:
                        pass
                    LOGGER.warning(
                        f"Couldn't find encoder for '{ext[1:]}', defaulting to 'wav'"
                    )
                    processed_audio.export(save_temps + "_processed" + ".wav")
            removed_segments = dna_segments
        audio_data = processed_audio
    else:
        audio_data = audio

    # Initialize the SoundSwallower decoder with the sample rate from the audio
    frame_points = int(cfg.get_float("-samprate") * cfg.get_float("-wlen"))
    fft_size = 1
    while fft_size < frame_points:
        fft_size = fft_size << 1
    cfg.set_int("-nfft", fft_size)
    frame_size = 1.0 / cfg.get_int("-frate")

    # Note: the frames are typically 0.01s long (i.e., the frame rate is typically 100),
    # while the audio segments manipulated using pydub are sliced and accessed in
    # millisecond intervals. For audio segments, the ms slice assumption is hard-coded
    # all over, while frames_to_time() is used to convert segment boundaries returned by
    # soundswallower, which are indexes in frames, into durations in seconds.
    def frames_to_time(frames):
        return frames * frame_size

    # Extract the list of sequences of words in the XML
    word_sequences = get_sequences(xml, xml_path, unit=unit)
    end = 0
    for i, word_sequence in enumerate(word_sequences):

        i_suffix = "" if i == 0 else "." + str(i + 1)

        # Generate dictionary and FSG for the current sequence of words
        dict_data = make_dict(word_sequence.words, xml_path, unit=unit)
        if save_temps:
            dict_file = io.open(save_temps + ".dict" + i_suffix, "wb")
        else:
            dict_file = PortableNamedTemporaryFile(prefix="readalongs_dict_",
                                                   delete=False)
        dict_file.write(dict_data.encode("utf-8"))
        dict_file.close()

        fsg_data = make_fsg(word_sequence.words, xml_path)
        if save_temps:
            fsg_file = io.open(save_temps + ".fsg" + i_suffix, "wb")
        else:
            fsg_file = PortableNamedTemporaryFile(prefix="readalongs_fsg_",
                                                  delete=False)
        fsg_file.write(fsg_data.encode("utf-8"))
        fsg_file.close()

        # Extract the part of the audio corresponding to this word sequence
        audio_segment = extract_section(audio_data, word_sequence.start,
                                        word_sequence.end)
        if save_temps and audio_segment is not audio_data:
            write_audio_to_file(audio_segment, save_temps + ".wav" + i_suffix)

        # Configure soundswallower for this sequence's dict and fsg
        cfg.set_string("-dict", dict_file.name)
        cfg.set_string("-fsg", fsg_file.name)
        ps = soundswallower.Decoder(cfg)
        # Align this word sequence
        ps.start_utt()
        ps.process_raw(audio_segment.raw_data, no_search=False, full_utt=True)
        ps.end_utt()

        if not ps.seg():
            raise RuntimeError(
                "Alignment produced no segments, "
                "please examine dictionary and input audio and text.")

        # List of removed segments for the sequence we are currently processing
        curr_removed_segments = dna_union(word_sequence.start,
                                          word_sequence.end,
                                          audio_length_in_ms, removed_segments)

        prev_segment_count = len(results["words"])
        for seg in ps.seg():
            if seg.word in ("<sil>", "[NOISE]"):
                continue
            start = frames_to_time(seg.start_frame)
            end = frames_to_time(seg.end_frame + 1)
            # change to ms
            start_ms = start * 1000
            end_ms = end * 1000
            if curr_removed_segments:
                start_ms += calculate_adjustment(start_ms,
                                                 curr_removed_segments)
                end_ms += calculate_adjustment(end_ms, curr_removed_segments)
                start_ms, end_ms = correct_adjustments(start_ms, end_ms,
                                                       curr_removed_segments)
                # change back to seconds to write to smil
                start = start_ms / 1000
                end = end_ms / 1000
            results["words"].append({
                "id": seg.word,
                "start": start,
                "end": end
            })
            LOGGER.info("Segment: %s (%.3f : %.3f)", seg.word, start, end)
        aligned_segment_count = len(results["words"]) - prev_segment_count
        if aligned_segment_count != len(word_sequence.words):
            LOGGER.warning(
                f"Word sequence {i+1} had {len(word_sequence.words)} tokens "
                f"but produced {aligned_segment_count} segments. "
                "Check that the anchors are well positioned or "
                "that the audio corresponds to the text.")
    final_end = end

    if len(results["words"]) == 0:
        raise RuntimeError(
            "Alignment produced only noise or silence segments, "
            "please verify that the text is an actual transcript of the audio."
        )
    if len(results["words"]) != len(results["tokenized"].xpath("//" + unit)):
        LOGGER.warning(
            "Alignment produced a different number of segments and tokens than "
            "were in the input. Sequences between some anchors probably did not "
            "align successfully. Look for more anchors-related warnings above in the log."
        )

    if not bare:
        # Take all the boundaries (anchors) around segments and add them as DNA
        # segments for the purpose of splitting silences
        dna_for_silence_splitting = copy.deepcopy(dna_segments)
        last_end = None
        for seq in word_sequences:
            if last_end or seq.start:
                dna_for_silence_splitting.append({
                    "begin": (last_end or seq.start),
                    "end": (seq.start or last_end)
                })
            last_end = seq.end
        if last_end:
            dna_for_silence_splitting.append({
                "begin": last_end,
                "end": last_end
            })
        dna_for_silence_splitting = sort_and_join_dna_segments(
            dna_for_silence_splitting)

        split_silences(results["words"], final_end, dna_for_silence_splitting)
    words_dict = {
        x["id"]: {
            "start": x["start"],
            "end": x["end"]
        }
        for x in results["words"]
    }
    silence_offsets = defaultdict(int)
    silence = 0
    if results["tokenized"].xpath("//silence"):
        endpoint = 0
        all_good = True
        for el in results["tokenized"].xpath("//*"):
            if el.tag == "silence" and "dur" in el.attrib:
                try:
                    silence_ms = parse_time(el.attrib["dur"])
                except ValueError as err:
                    LOGGER.error(
                        f'Invalid silence element in {xml_path}: invalid "time" '
                        f'attribute "{el.attrib["dur"]}": {err}')
                    all_good = False
                    continue
                silence_segment = AudioSegment.silent(
                    duration=silence_ms)  # create silence segment
                silence += silence_ms  # add silence length to total silence
                audio = (audio[:endpoint] + silence_segment + audio[endpoint:]
                         )  # insert silence at previous endpoint
                endpoint += silence_ms  # add silence to previous endpoint
            if el.tag == "w":
                silence_offsets[el.attrib["id"]] += (
                    silence / 1000
                )  # add silence in seconds to silence offset for word id
                endpoint = (words_dict[el.attrib["id"]]["end"] * 1000
                            ) + silence  # bump endpoint and include silence
        if not all_good:
            raise RuntimeError(
                f"Could not parse all duration attributes in silence elements in {xml_path}, please make sure each silence "
                'element is properly formatted, e.g., <silence dur="1.5s"/>.  Aborting.'
            )
    if silence:
        for word in results["words"]:
            word["start"] += silence_offsets[word["id"]]
            word["end"] += silence_offsets[word["id"]]
        results["audio"] = audio
    return results
Exemple #8
0
def align_audio(
    xml_path, audio_path, unit="w", bare=False, config=None, save_temps=None,
):
    """ Align an XML input file to an audio file.

    Parameters
    ----------
    xml_path : str
        Path to XML input file in TEI-like format
    audio_path : str
        Path to audio input. Must be in a format supported by ffmpeg
    unit : str, optional
        Element to create alignments for, by default 'w'
    bare : boolean, optional
        If False, split silence into adjoining tokens (default)
        If True, keep the bare tokens without adjoining silences.
    config : object, optional
        Uses ReadAlong-Studio configuration
    save_temps : Union[str, None], optional
        save temporary files, by default None

    #TODO: document return
    Returns
    -------
    [type]
        [description]

    #TODO: document exceptions
    Raises
    ------
    RuntimeError
        [description]
    RuntimeError
        [description]
    RuntimeError
        [description]
    RuntimeError
        [description]
    """
    results: Dict[str, List] = {"words": []}

    # First do G2P
    try:
        xml = etree.parse(xml_path).getroot()
    except etree.XMLSyntaxError as e:
        raise RuntimeError("Error parsing XML input file %s: %s." % (xml_path, e))
    if config and "images" in config:
        xml = add_images(xml, config)
    if config and "xml" in config:
        xml = add_supplementary_xml(xml, config)
    xml = add_lang_ids(xml, unit="s")
    xml = tokenize_xml(xml)
    if save_temps:
        save_xml(save_temps + ".tokenized.xml", xml)
    results["tokenized"] = xml = add_ids(xml)
    if save_temps:
        save_xml(save_temps + ".ids.xml", xml)
    xml = convert_xml(xml)
    if save_temps:
        save_xml(save_temps + ".g2p.xml", xml)

    # Now generate dictionary and FSG
    dict_data = make_dict(xml, xml_path, unit=unit)
    if save_temps:
        dict_file = io.open(save_temps + ".dict", "wb")
    else:
        dict_file = PortableNamedTemporaryFile(prefix="readalongs_dict_", delete=False)
    dict_file.write(dict_data.encode("utf-8"))
    dict_file.flush()
    fsg_data = make_fsg(xml, xml_path, unit=unit)
    if save_temps:
        fsg_file = io.open(save_temps + ".fsg", "wb")
    else:
        fsg_file = PortableNamedTemporaryFile(prefix="readalongs_fsg_", delete=False)
    fsg_file.write(fsg_data.encode("utf-8"))
    fsg_file.flush()

    # Now do alignment
    cfg = soundswallower.Decoder.default_config()
    model_path = soundswallower.get_model_path()
    cfg.set_boolean("-remove_noise", False)
    cfg.set_boolean("-remove_silence", False)
    cfg.set_string("-hmm", os.path.join(model_path, "en-us"))
    cfg.set_string("-dict", dict_file.name)
    cfg.set_string("-fsg", fsg_file.name)
    # cfg.set_string('-samprate', "no no")
    cfg.set_float("-beam", 1e-100)
    cfg.set_float("-wbeam", 1e-80)

    audio = read_audio_from_file(audio_path)
    audio = audio.set_channels(1).set_sample_width(2)
    #  Downsampling is (probably) not necessary
    cfg.set_float("-samprate", audio.frame_rate)

    # Process audio
    do_not_align_segments = None
    if config and "do-not-align" in config:
        # Reverse sort un-alignable segments
        do_not_align_segments = sorted(
            config["do-not-align"]["segments"], key=lambda x: x["begin"], reverse=True
        )
        method = config["do-not-align"].get("method", "remove")
        # Determine do-not-align method
        if method == "mute":
            dna_method = mute_section
        elif method == "remove":
            dna_method = remove_section
        else:
            LOGGER.error("Unknown do-not-align method declared")
        # Process audio and save temporary files
        if method == "mute" or method == "remove":
            processed_audio = audio
            for seg in do_not_align_segments:
                processed_audio = dna_method(
                    processed_audio, int(seg["begin"]), int(seg["end"])
                )
            if save_temps:
                _, ext = os.path.splitext(audio_path)
                try:
                    processed_audio.export(
                        save_temps + "_processed" + ext, format=ext[1:]
                    )
                except CouldntEncodeError:
                    os.remove(save_temps + "_processed" + ext)
                    LOGGER.warn(
                        f"Couldn't find encoder for '{ext[1:]}', defaulting to 'wav'"
                    )
                    processed_audio.export(save_temps + "_processed" + ".wav")
        raw_data = processed_audio.raw_data
    else:
        raw_data = audio.raw_data

    frame_points = int(cfg.get_float("-samprate") * cfg.get_float("-wlen"))
    fft_size = 1
    while fft_size < frame_points:
        fft_size = fft_size << 1
    cfg.set_int("-nfft", fft_size)
    ps = soundswallower.Decoder(cfg)
    frame_size = 1.0 / cfg.get_int("-frate")

    def frames_to_time(frames):
        return frames * frame_size

    ps.start_utt()
    ps.process_raw(raw_data, no_search=False, full_utt=True)
    ps.end_utt()

    if not ps.seg():
        raise RuntimeError(
            "Alignment produced no segments, "
            "please examine dictionary and input audio and text."
        )

    for seg in ps.seg():
        start = frames_to_time(seg.start_frame)
        end = frames_to_time(seg.end_frame + 1)
        # change to ms
        start_ms = start * 1000
        end_ms = end * 1000
        if do_not_align_segments and method == "remove":
            start_ms += calculate_adjustment(start_ms, do_not_align_segments)
            end_ms += calculate_adjustment(end_ms, do_not_align_segments)
            start_ms, end_ms = correct_adjustments(
                start_ms, end_ms, do_not_align_segments
            )
            # change back to seconds to write to smil
            start = start_ms / 1000
            end = end_ms / 1000
        if seg.word in ("<sil>", "[NOISE]"):
            continue
        else:
            results["words"].append({"id": seg.word, "start": start, "end": end})
        LOGGER.info("Segment: %s (%.3f : %.3f)", seg.word, start, end)

    if len(results["words"]) == 0:
        raise RuntimeError(
            "Alignment produced only noise or silence segments, "
            "please examine dictionary and input audio and text."
        )
    if len(results["words"]) != len(results["tokenized"].xpath("//" + unit)):
        raise RuntimeError(
            "Alignment produced a different number of segments and tokens, "
            "please examine dictionary and input audio and text."
        )

    final_end = end

    if not bare:
        # Split adjoining silence/noise between words
        last_end = 0.0
        last_word = dict()
        for word in results["words"]:
            silence = word["start"] - last_end
            midpoint = last_end + silence / 2
            if silence > 0:
                if last_word:
                    last_word["end"] = midpoint
                word["start"] = midpoint
            last_word = word
            last_end = word["end"]
        silence = final_end - last_end
        if silence > 0:
            if last_word is not None:
                last_word["end"] += silence / 2
    dict_file.close()
    if not save_temps:
        os.unlink(dict_file.name)
    fsg_file.close()
    if not save_temps:
        os.unlink(fsg_file.name)

    return results
Exemple #9
0
def go(input_filename, output_filename):
    xml = load_xml(input_filename)
    xml = tokenize_xml(xml)
    save_xml(output_filename, xml)
Exemple #10
0
def go(input_filename: str, output_filename: str) -> None:
    xml = load_xml(input_filename)
    xml = add_ids(xml)
    save_xml(output_filename, xml)
Exemple #11
0
def align(**kwargs):
    """Align TEXTFILE and AUDIOFILE and create output files as OUTPUT_BASE.* in directory
    OUTPUT_BASE/.

    TEXTFILE:    Input text file path (in XML, or plain text with -i)

    AUDIOFILE:   Input audio file path, in any format supported by ffmpeg

    OUTPUT_BASE: Base name for output files
    """
    config = kwargs.get("config", None)
    if config:
        if config.endswith("json"):
            try:
                with open(config) as f:
                    config = json.load(f)
            except json.decoder.JSONDecodeError:
                LOGGER.error(f"Config file at {config} is not valid json.")
        else:
            raise click.BadParameter(f"Config file '{config}' must be in JSON format")

    output_dir = kwargs["output_base"]
    if os.path.exists(output_dir):
        if not os.path.isdir(output_dir):
            raise click.UsageError(
                f"Output folder '{output_dir}' already exists but is a not a directory."
            )
        if not kwargs["force_overwrite"]:
            raise click.UsageError(
                f"Output folder '{output_dir}' already exists, use -f to overwrite."
            )
    else:
        os.mkdir(output_dir)

    # Make sure we can write to the output directory, for early error checking and user
    # friendly error messages.
    try:
        with TemporaryFile(dir=output_dir):
            pass
    except Exception:
        raise click.UsageError(
            f"Cannot write into output folder '{output_dir}'. Please verify permissions."
        )

    output_basename = os.path.basename(output_dir)
    output_base = os.path.join(output_dir, output_basename)
    temp_base = None
    if kwargs["save_temps"]:
        temp_dir = os.path.join(output_dir, "tempfiles")
        if not os.path.isdir(temp_dir):
            if os.path.exists(temp_dir) and kwargs["force_overwrite"]:
                os.unlink(temp_dir)
            os.mkdir(temp_dir)
        temp_base = os.path.join(temp_dir, output_basename)

    if kwargs["debug"]:
        LOGGER.setLevel("DEBUG")
    if kwargs["text_input"]:
        if not kwargs["language"]:
            LOGGER.warn("No input language provided, using undetermined mapping")
        tempfile, kwargs["textfile"] = create_input_tei(
            input_file_name=kwargs["textfile"],
            text_language=kwargs["language"],
            save_temps=temp_base,
        )
    if kwargs["output_xhtml"]:
        tokenized_xml_path = "%s.xhtml" % output_base
    else:
        _, input_ext = os.path.splitext(kwargs["textfile"])
        tokenized_xml_path = "%s%s" % (output_base, input_ext)
    if os.path.exists(tokenized_xml_path) and not kwargs["force_overwrite"]:
        raise click.BadParameter(
            "Output file %s exists already, use -f to overwrite." % tokenized_xml_path
        )
    smil_path = output_base + ".smil"
    if os.path.exists(smil_path) and not kwargs["force_overwrite"]:
        raise click.BadParameter(
            "Output file %s exists already, use -f to overwrite." % smil_path
        )
    _, audio_ext = os.path.splitext(kwargs["audiofile"])
    audio_path = output_base + audio_ext
    if os.path.exists(audio_path) and not kwargs["force_overwrite"]:
        raise click.BadParameter(
            "Output file %s exists already, use -f to overwrite." % audio_path
        )
    unit = kwargs.get("unit", "w")
    bare = kwargs.get("bare", False)
    if (
        not unit
    ):  # .get() above should handle this but apparently the way kwargs is implemented
        unit = "w"  # unit could still be None here.
    try:
        results = align_audio(
            kwargs["textfile"],
            kwargs["audiofile"],
            unit=unit,
            bare=bare,
            config=config,
            save_temps=temp_base,
        )
    except RuntimeError as e:
        LOGGER.error(e)
        exit(1)

    if kwargs["text_grid"]:
        audio = read_audio_from_file(kwargs["audiofile"])
        duration = audio.frame_count() / audio.frame_rate
        words, sentences = return_words_and_sentences(results)
        textgrid = write_to_text_grid(words, sentences, duration)
        textgrid.to_file(output_base + ".TextGrid")
        textgrid.to_eaf().to_file(output_base + ".eaf")

    if kwargs["closed_captioning"]:
        words, sentences = return_words_and_sentences(results)
        webvtt_sentences = write_to_subtitles(sentences)
        webvtt_sentences.save(output_base + "_sentences.vtt")
        webvtt_sentences.save_as_srt(output_base + "_sentences.srt")
        webvtt_words = write_to_subtitles(words)
        webvtt_words.save(output_base + "_words.vtt")
        webvtt_words.save_as_srt(output_base + "_words.srt")

    if kwargs["output_xhtml"]:
        convert_to_xhtml(results["tokenized"])

    save_minimal_index_html(
        os.path.join(output_dir, "index.html"),
        os.path.basename(tokenized_xml_path),
        os.path.basename(smil_path),
        os.path.basename(audio_path),
    )

    save_xml(tokenized_xml_path, results["tokenized"])
    smil = make_smil(
        os.path.basename(tokenized_xml_path), os.path.basename(audio_path), results
    )
    shutil.copy(kwargs["audiofile"], audio_path)
    save_txt(smil_path, smil)
Exemple #12
0
def g2p(**kwargs):
    """Apply g2p mappings to TOKFILE into G2PFILE.

    TOKFILE should have been produced by 'readalongs tokenize'.
    G2PFILE can then be modified to adjust the phonetic representation as needed.
    'readalongs align' can be called with G2PFILE instead of TOKFILE as XML input.

    The g2p cascade will be enabled whenever an XML element or any of its
    ancestors in TOKFILE has the attribute "fallback-langs" containing a comma-
    or colon-separated list of language codes. Provide multiple language codes to
    "readalongs prepare" via its -l option to generate this attribute globally,
    or add it manually where needed. Undetermined, "und", is automatically
    added at the end of the language list provided via -l.

    With the g2p cascade, if a word cannot be mapped to valid ARPABET with the
    language found in the "xml:lang" attribute, the languages in
    "fallback-langs" are tried in order until a valid ARPABET mapping is
    generated.

    The output XML file can be used as input to align.

    TOKFILE: Path to the input tokenized XML file, or - for stdin

    G2PFILE: Output path for the g2p'd XML, or - for stdout [default: TOKFILE
    with .g2p. inserted]
    """
    if kwargs["debug"]:
        LOGGER.setLevel("DEBUG")
        LOGGER.info(
            "Running readalongs g2p(tokfile={}, g2pfile={}, force-overwrite={})."
            .format(
                kwargs["tokfile"],
                kwargs["g2pfile"],
                kwargs["force_overwrite"],
            ))

    input_file = kwargs["tokfile"]

    if not kwargs["g2pfile"]:
        output_path = get_click_file_name(input_file)
        if output_path != "-":
            if output_path.endswith(".xml"):
                output_path = output_path[:-4]
            if output_path.endswith(".tokenized"):
                output_path = output_path[:-len(".tokenized")]
            output_path += ".g2p.xml"
    else:
        output_path = kwargs["g2pfile"]
        if not output_path.endswith(".xml") and not output_path == "-":
            output_path += ".xml"

    if os.path.exists(output_path) and not kwargs["force_overwrite"]:
        raise click.BadParameter(
            "Output file %s exists already, use -f to overwrite." %
            output_path)

    try:
        xml = etree.parse(input_file).getroot()
    except etree.XMLSyntaxError as e:
        raise click.BadParameter(
            "Error parsing input file %s as XML, please verify it. Parser error: %s"
            % (get_click_file_name(input_file), e))

    # Add the IDs to paragraph, sentences, word, etc.
    xml = add_ids(xml)

    # Apply the g2p mappings.
    xml, valid = convert_xml(
        xml,
        verbose_warnings=kwargs["g2p_verbose"],
    )

    if output_path == "-":
        write_xml(sys.stdout.buffer, xml)
    else:
        save_xml(output_path, xml)
        LOGGER.info("Wrote {}".format(output_path))

    if not valid:
        LOGGER.error("Some word(s) could not be g2p'd correctly." + (
            " Run again with --g2p-verbose to get more detailed error messages."
            if not kwargs["g2p_verbose"] else ""))
        sys.exit(1)