Example 1
 def setUp(self):
     self.data_path = os.path.join(os.path.dirname(__file__), "data")
     self.audio_segment = read_audio_from_file(
         os.path.join(self.data_path, "audio_sample.ogg")
     )
     self.noisy_segment = read_audio_from_file(
         os.path.join(self.data_path, "noise_at_1500.mp3")
     )
     # Use a TemporaryDirectory object for temp outputs, so they get cleaned up
     # automatically even in case of errors or aborted runs.
     self.tempdirobj = tempfile.TemporaryDirectory(
         prefix="test_audio_tmpdir", dir="."
     )
     self.tempdir = self.tempdirobj.name
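
The cleanup promised in the comment above happens when the TemporaryDirectory object is finalized; a minimal sketch of an explicit tearDown, assuming the test class defines one (this method is a hypothetical addition, not part of the original):

 def tearDown(self):
     # Hypothetical counterpart to setUp: release the temporary output directory
     # explicitly rather than waiting for the TemporaryDirectory object to be
     # garbage-collected.
     self.tempdirobj.cleanup()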
Example 2
 def test_write_audio_to_file(self):
     """Mininal unit testing for write_audio_to_file"""
     section = extract_section(self.audio_segment, 1000, 2000)
     output_path = os.path.join(self.tempdir, "section_output.mp3")
     write_audio_to_file(section, output_path)
     self.assertTrue(os.path.exists(output_path))
     reloaded_section = read_audio_from_file(output_path)
     self.assertAlmostEqual(
         len(section),
         len(reloaded_section),
         msg="reloaded audio file is more than 50ms shorter or longer",
         delta=50,
     )
Example 3
def save_readalong(  # noqa: C901
        # noqa C901 - ignore the complexity of this function
        # this * forces all arguments to be passed by name, because I don't want any
        # code to depend on their order in the future
        *,
        align_results: Dict[str, List],
        output_dir: str,
        output_basename: str,
        config=None,
        audiofile: str,
        audiosegment: AudioSegment = None,
        output_formats=(),
):
    """Save the results from align_audio() into the output files required for a
        readalong

    Args:
        align_results(Dict[str,List]): return value from align_audio()
        output_dir (str): directory in which to save the readalong;
            output_dir should already exist, and files it contains may be overwritten
        output_basename (str): basename of the files to save in output_dir
        config ([type TODO], optional): alignment configuration loaded from the json
        audiofile (str): path to the audio file passed to align_audio()
        output_formats (List[str], optional): list of desired output formats
        audiosegment (AudioSegment, optional): a pydub.AudioSegment object of processed
            audio; if None, the original audio file at `audiofile` is copied as is

    Returns:
        None

    Raises:
        [TODO]
    """
    # Round all times to three decimals; anything more is excess precision
    # polluting the output files, and usually due to float rounding errors anyway.
    for w in align_results["words"]:
        w["start"] = round(w["start"], 3)
        w["end"] = round(w["end"], 3)

    output_base = os.path.join(output_dir, output_basename)

    # Create textgrid object if outputting to TextGrid or eaf
    if "TextGrid" in output_formats or "eaf" in output_formats:
        audio = read_audio_from_file(audiofile)
        duration = audio.frame_count() / audio.frame_rate
        words, sentences = return_words_and_sentences(align_results)
        textgrid = write_to_text_grid(words, sentences, duration)

        if "TextGrid" in output_formats:
            textgrid.to_file(output_base + ".TextGrid")

        if "eaf" in output_formats:
            textgrid.to_eaf().to_file(output_base + ".eaf")

    # Create webvtt object if outputting to vtt or srt
    if "srt" in output_formats or "vtt" in output_formats:
        words, sentences = return_words_and_sentences(align_results)
        cc_sentences = write_to_subtitles(sentences)
        cc_words = write_to_subtitles(words)

        if "srt" in output_formats:
            cc_sentences.save_as_srt(output_base + "_sentences.srt")
            cc_words.save_as_srt(output_base + "_words.srt")

        if "vtt" in output_formats:
            cc_words.save(output_base + "_words.vtt")
            cc_sentences.save(output_base + "_sentences.vtt")

    tokenized_xml_path = output_base + ".xml"
    save_xml(tokenized_xml_path, align_results["tokenized"])

    if "xhtml" in output_formats:
        convert_to_xhtml(align_results["tokenized"])
        tokenized_xhtml_path = output_base + ".xhtml"
        save_xml(tokenized_xhtml_path, align_results["tokenized"])

    _, audio_ext = os.path.splitext(audiofile)
    audio_path = output_base + audio_ext
    audio_format = audio_ext[1:]
    if audiosegment:
        if audio_format in ["m4a", "aac"]:
            audio_format = "ipod"
        try:
            audiosegment.export(audio_path, format=audio_format)
        except CouldntEncodeError:
            LOGGER.warning(f"The audio file at {audio_path} could \
                not be exported in the {audio_format} format. \
                Please ensure your installation of ffmpeg has \
                the necessary codecs.")
            audio_path = output_base + ".wav"
            audiosegment.export(audio_path, format="wav")
    else:
        shutil.copy(audiofile, audio_path)

    smil_path = output_base + ".smil"
    smil = make_smil(
        os.path.basename(tokenized_xml_path),
        os.path.basename(audio_path),
        align_results,
    )
    save_txt(smil_path, smil)

    if "html" in output_formats:
        html_out_path = output_base + ".html"
        html_out = create_web_component_html(tokenized_xml_path, smil_path,
                                             audio_path)
        with open(html_out_path, "w") as f:
            f.write(html_out)

    save_minimal_index_html(
        os.path.join(output_dir, "index.html"),
        os.path.basename(tokenized_xml_path),
        os.path.basename(smil_path),
        os.path.basename(audio_path),
    )

    # Copy the image files to the output's asset directory, if any are found
    if config and "images" in config:
        assets_dir = os.path.join(output_dir, "assets")
        try:
            os.mkdir(assets_dir)
        except FileExistsError:
            if not os.path.isdir(assets_dir):
                raise
        for _, image in config["images"].items():
            if image.startswith("http"):
                LOGGER.warning(
                    f"Please make sure {image} is accessible to clients using your read-along."
                )
            else:
                try:
                    shutil.copy(image, assets_dir)
                except Exception as e:
                    LOGGER.warning(
                        f"Please copy {image} to {assets_dir} before deploying your read-along. ({e})"
                    )
                if os.path.basename(image) != image:
                    LOGGER.warning(
                        f"Read-along images were tested with absolute urls (starting with http(s):// "
                        f"and filenames without a path. {image} might not work as specified."
                    )
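
For context, a hedged usage sketch of save_readalong() with results from align_audio() (shown in the next examples); every path, basename and format in it is an illustrative placeholder, not a value from the original code:

# Illustrative sketch only: "story.xml", "story.mp3", "output" and the
# output_formats tuple are placeholders.
results = align_audio("story.xml", "story.mp3")
save_readalong(
    align_results=results,
    output_dir="output",  # must already exist
    output_basename="story",
    audiofile="story.mp3",
    output_formats=("srt", "vtt", "TextGrid"),
)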
Example 4
def align_audio(  # noqa: C901
    xml_path,
    audio_path,
    unit="w",
    bare=False,
    config=None,
    save_temps=None,
    verbose_g2p_warnings=False,
):
    """Align an XML input file to an audio file.

    Args:
        xml_path (str): Path to XML input file in TEI-like format
        audio_path (str): Path to audio input. Must be in a format supported by ffmpeg
        unit (str): Optional; Element to create alignments for, by default 'w'
        bare (boolean): Optional;
            If False, split silence into adjoining tokens (default)
            If True, keep the bare tokens without adjoining silences.
        config (object): Optional; ReadAlong-Studio configuration to use
        save_temps (str): Optional; Save temporary files, by default None
        verbose_g2p_warnings (boolean): Optional; display all g2p errors and warnings
            iff True

    Returns:
        Dict[str, List]: a mapping with "words" (a list of dicts with "id",
            "start" and "end" in seconds), "tokenized" (the tokenized XML),
            and "audio" (the modified AudioSegment, or None if unmodified)

    Raises:
        RuntimeError: if the XML input cannot be parsed, if some words could
            not be g2p'd, or if alignment produced no usable segments
    """
    results: Dict[str, List] = {"words": [], "audio": None}

    # First do G2P
    try:
        xml = etree.parse(xml_path).getroot()
    except etree.XMLSyntaxError as e:
        raise RuntimeError("Error parsing XML input file %s: %s." %
                           (xml_path, e)) from e
    if config and "images" in config:
        xml = add_images(xml, config)
    if config and "xml" in config:
        xml = add_supplementary_xml(xml, config)
    xml = tokenize_xml(xml)
    if save_temps:
        save_xml(save_temps + ".tokenized.xml", xml)
    results["tokenized"] = xml = add_ids(xml)
    if save_temps:
        save_xml(save_temps + ".ids.xml", xml)
    xml, valid = convert_xml(xml, verbose_warnings=verbose_g2p_warnings)
    if save_temps:
        save_xml(save_temps + ".g2p.xml", xml)
    if not valid:
        raise RuntimeError(
            "Some words could not be g2p'd correctly. Aborting. "
            "Run with --g2p-verbose for more detailed g2p error logs.")

    # Prepare the SoundsSwallower (formerly PocketSphinx) configuration
    cfg = soundswallower.Decoder.default_config()
    model_path = soundswallower.get_model_path()
    cfg.set_boolean("-remove_noise", False)
    cfg.set_boolean("-remove_silence", False)
    cfg.set_string("-hmm", os.path.join(model_path, "en-us"))
    # cfg.set_string('-samprate', "no no")
    cfg.set_float("-beam", 1e-100)
    cfg.set_float("-wbeam", 1e-80)

    # Read the audio file
    audio = read_audio_from_file(audio_path)
    audio = audio.set_channels(1).set_sample_width(2)
    # len() of a pydub AudioSegment is its duration in milliseconds
    audio_length_in_ms = len(audio)
    #  Downsampling is (probably) not necessary
    cfg.set_float("-samprate", audio.frame_rate)

    # Process audio, silencing or removing any DNA segments
    dna_segments = []
    removed_segments = []
    if config and "do-not-align" in config:
        # Sort un-alignable segments and join overlapping ones
        dna_segments = sort_and_join_dna_segments(
            config["do-not-align"]["segments"])
        method = config["do-not-align"].get("method", "remove")
        # Determine do-not-align method
        if method == "mute":
            dna_method = mute_section
        elif method == "remove":
            dna_method = remove_section
        else:
            LOGGER.error("Unknown do-not-align method declared")
        # Process audio and save temporary files
        if method in ("mute", "remove"):
            processed_audio = audio
            # Process the DNA segments in reverse order so we don't have to correct
            # for previously processed ones when using the "remove" method.
            for seg in reversed(dna_segments):
                processed_audio = dna_method(processed_audio,
                                             int(seg["begin"]),
                                             int(seg["end"]))
            if save_temps:
                _, ext = os.path.splitext(audio_path)
                try:
                    processed_audio.export(save_temps + "_processed" + ext,
                                           format=ext[1:])
                except CouldntEncodeError:
                    try:
                        os.remove(save_temps + "_processed" + ext)
                    except BaseException:
                        pass
                    LOGGER.warning(
                        f"Couldn't find encoder for '{ext[1:]}', defaulting to 'wav'"
                    )
                    processed_audio.export(save_temps + "_processed" + ".wav")
            removed_segments = dna_segments
        audio_data = processed_audio
    else:
        audio_data = audio

    # Initialize the SoundSwallower decoder with the sample rate from the audio
    frame_points = int(cfg.get_float("-samprate") * cfg.get_float("-wlen"))
    fft_size = 1
    while fft_size < frame_points:
        fft_size = fft_size << 1
    cfg.set_int("-nfft", fft_size)
    frame_size = 1.0 / cfg.get_int("-frate")

    # Note: the frames are typically 0.01s long (i.e., the frame rate is typically 100),
    # while the audio segments manipulated using pydub are sliced and accessed in
    # millisecond intervals. For audio segments, the ms slice assumption is hard-coded
    # all over, while frames_to_time() is used to convert segment boundaries returned by
    # soundswallower, which are indexes in frames, into durations in seconds.
    def frames_to_time(frames):
        return frames * frame_size

    # Extract the list of sequences of words in the XML
    word_sequences = get_sequences(xml, xml_path, unit=unit)
    end = 0
    for i, word_sequence in enumerate(word_sequences):

        i_suffix = "" if i == 0 else "." + str(i + 1)

        # Generate dictionary and FSG for the current sequence of words
        dict_data = make_dict(word_sequence.words, xml_path, unit=unit)
        if save_temps:
            dict_file = io.open(save_temps + ".dict" + i_suffix, "wb")
        else:
            dict_file = PortableNamedTemporaryFile(prefix="readalongs_dict_",
                                                   delete=False)
        dict_file.write(dict_data.encode("utf-8"))
        dict_file.close()

        fsg_data = make_fsg(word_sequence.words, xml_path)
        if save_temps:
            fsg_file = io.open(save_temps + ".fsg" + i_suffix, "wb")
        else:
            fsg_file = PortableNamedTemporaryFile(prefix="readalongs_fsg_",
                                                  delete=False)
        fsg_file.write(fsg_data.encode("utf-8"))
        fsg_file.close()

        # Extract the part of the audio corresponding to this word sequence
        audio_segment = extract_section(audio_data, word_sequence.start,
                                        word_sequence.end)
        if save_temps and audio_segment is not audio_data:
            write_audio_to_file(audio_segment, save_temps + ".wav" + i_suffix)

        # Configure soundswallower for this sequence's dict and fsg
        cfg.set_string("-dict", dict_file.name)
        cfg.set_string("-fsg", fsg_file.name)
        ps = soundswallower.Decoder(cfg)
        # Align this word sequence
        ps.start_utt()
        ps.process_raw(audio_segment.raw_data, no_search=False, full_utt=True)
        ps.end_utt()

        if not ps.seg():
            raise RuntimeError(
                "Alignment produced no segments, "
                "please examine dictionary and input audio and text.")

        # List of removed segments for the sequence we are currently processing
        curr_removed_segments = dna_union(word_sequence.start,
                                          word_sequence.end,
                                          audio_length_in_ms, removed_segments)

        prev_segment_count = len(results["words"])
        for seg in ps.seg():
            if seg.word in ("<sil>", "[NOISE]"):
                continue
            start = frames_to_time(seg.start_frame)
            end = frames_to_time(seg.end_frame + 1)
            # change to ms
            start_ms = start * 1000
            end_ms = end * 1000
            if curr_removed_segments:
                start_ms += calculate_adjustment(start_ms,
                                                 curr_removed_segments)
                end_ms += calculate_adjustment(end_ms, curr_removed_segments)
                start_ms, end_ms = correct_adjustments(start_ms, end_ms,
                                                       curr_removed_segments)
                # change back to seconds to write to smil
                start = start_ms / 1000
                end = end_ms / 1000
            results["words"].append({
                "id": seg.word,
                "start": start,
                "end": end
            })
            LOGGER.info("Segment: %s (%.3f : %.3f)", seg.word, start, end)
        aligned_segment_count = len(results["words"]) - prev_segment_count
        if aligned_segment_count != len(word_sequence.words):
            LOGGER.warning(
                f"Word sequence {i+1} had {len(word_sequence.words)} tokens "
                f"but produced {aligned_segment_count} segments. "
                "Check that the anchors are well positioned or "
                "that the audio corresponds to the text.")
    final_end = end

    if len(results["words"]) == 0:
        raise RuntimeError(
            "Alignment produced only noise or silence segments, "
            "please verify that the text is an actual transcript of the audio."
        )
    if len(results["words"]) != len(results["tokenized"].xpath("//" + unit)):
        LOGGER.warning(
            "Alignment produced a different number of segments and tokens than "
            "were in the input. Sequences between some anchors probably did not "
            "align successfully. Look for more anchors-related warnings above in the log."
        )

    if not bare:
        # Take all the boundaries (anchors) around segments and add them as DNA
        # segments for the purpose of splitting silences
        dna_for_silence_splitting = copy.deepcopy(dna_segments)
        last_end = None
        for seq in word_sequences:
            if last_end or seq.start:
                dna_for_silence_splitting.append({
                    "begin": (last_end or seq.start),
                    "end": (seq.start or last_end)
                })
            last_end = seq.end
        if last_end:
            dna_for_silence_splitting.append({
                "begin": last_end,
                "end": last_end
            })
        dna_for_silence_splitting = sort_and_join_dna_segments(
            dna_for_silence_splitting)

        split_silences(results["words"], final_end, dna_for_silence_splitting)
    words_dict = {
        x["id"]: {
            "start": x["start"],
            "end": x["end"]
        }
        for x in results["words"]
    }
    silence_offsets = defaultdict(int)
    silence = 0
    if results["tokenized"].xpath("//silence"):
        endpoint = 0
        all_good = True
        for el in results["tokenized"].xpath("//*"):
            if el.tag == "silence" and "dur" in el.attrib:
                try:
                    silence_ms = parse_time(el.attrib["dur"])
                except ValueError as err:
                    LOGGER.error(
                        f'Invalid silence element in {xml_path}: invalid "time" '
                        f'attribute "{el.attrib["dur"]}": {err}')
                    all_good = False
                    continue
                silence_segment = AudioSegment.silent(
                    duration=silence_ms)  # create silence segment
                silence += silence_ms  # add silence length to total silence
                audio = (audio[:endpoint] + silence_segment + audio[endpoint:]
                         )  # insert silence at previous endpoint
                endpoint += silence_ms  # add silence to previous endpoint
            if el.tag == "w":
                silence_offsets[el.attrib["id"]] += (
                    silence / 1000
                )  # add silence in seconds to silence offset for word id
                endpoint = (words_dict[el.attrib["id"]]["end"] * 1000
                            ) + silence  # bump endpoint and include silence
        if not all_good:
            raise RuntimeError(
                f"Could not parse all duration attributes in silence elements in {xml_path}, please make sure each silence "
                'element is properly formatted, e.g., <silence dur="1.5s"/>.  Aborting.'
            )
    if silence:
        for word in results["words"]:
            word["start"] += silence_offsets[word["id"]]
            word["end"] += silence_offsets[word["id"]]
        results["audio"] = audio
    return results
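
A short, hedged sketch of consuming what align_audio() returns; the file names are placeholders, and the export call applies only when silence elements caused the audio to be modified:

# Placeholder paths, for illustration only.
results = align_audio("story.xml", "story.mp3", unit="w")
for word in results["words"]:
    # "start" and "end" are in seconds; save_readalong() rounds them later
    print(word["id"], word["start"], word["end"])
if results["audio"] is not None:
    # the audio was modified (silence elements inserted), so export this version
    results["audio"].export("story_with_silences.wav", format="wav")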
Example 5
def align_audio(
    xml_path, audio_path, unit="w", bare=False, config=None, save_temps=None,
):
    """ Align an XML input file to an audio file.

    Parameters
    ----------
    xml_path : str
        Path to XML input file in TEI-like format
    audio_path : str
        Path to audio input. Must be in a format supported by ffmpeg
    unit : str, optional
        Element to create alignments for, by default 'w'
    bare : boolean, optional
        If False, split silence into adjoining tokens (default)
        If True, keep the bare tokens without adjoining silences.
    config : object, optional
        Uses ReadAlong-Studio configuration
    save_temps : Union[str, None], optional
        save temporary files, by default None

    Returns
    -------
    Dict[str, List]
        Mapping with "words" (a list of dicts with "id", "start" and "end"
        in seconds) and "tokenized" (the tokenized XML)

    Raises
    ------
    RuntimeError
        If the XML input cannot be parsed, if alignment produces no segments,
        or if the number of aligned segments does not match the number of tokens
    """
    results: Dict[str, List] = {"words": []}

    # First do G2P
    try:
        xml = etree.parse(xml_path).getroot()
    except etree.XMLSyntaxError as e:
        raise RuntimeError("Error parsing XML input file %s: %s." % (xml_path, e))
    if config and "images" in config:
        xml = add_images(xml, config)
    if config and "xml" in config:
        xml = add_supplementary_xml(xml, config)
    xml = add_lang_ids(xml, unit="s")
    xml = tokenize_xml(xml)
    if save_temps:
        save_xml(save_temps + ".tokenized.xml", xml)
    results["tokenized"] = xml = add_ids(xml)
    if save_temps:
        save_xml(save_temps + ".ids.xml", xml)
    xml = convert_xml(xml)
    if save_temps:
        save_xml(save_temps + ".g2p.xml", xml)

    # Now generate dictionary and FSG
    dict_data = make_dict(xml, xml_path, unit=unit)
    if save_temps:
        dict_file = io.open(save_temps + ".dict", "wb")
    else:
        dict_file = PortableNamedTemporaryFile(prefix="readalongs_dict_", delete=False)
    dict_file.write(dict_data.encode("utf-8"))
    dict_file.flush()
    fsg_data = make_fsg(xml, xml_path, unit=unit)
    if save_temps:
        fsg_file = io.open(save_temps + ".fsg", "wb")
    else:
        fsg_file = PortableNamedTemporaryFile(prefix="readalongs_fsg_", delete=False)
    fsg_file.write(fsg_data.encode("utf-8"))
    fsg_file.flush()

    # Now do alignment
    cfg = soundswallower.Decoder.default_config()
    model_path = soundswallower.get_model_path()
    cfg.set_boolean("-remove_noise", False)
    cfg.set_boolean("-remove_silence", False)
    cfg.set_string("-hmm", os.path.join(model_path, "en-us"))
    cfg.set_string("-dict", dict_file.name)
    cfg.set_string("-fsg", fsg_file.name)
    # cfg.set_string('-samprate', "no no")
    cfg.set_float("-beam", 1e-100)
    cfg.set_float("-wbeam", 1e-80)

    audio = read_audio_from_file(audio_path)
    audio = audio.set_channels(1).set_sample_width(2)
    #  Downsampling is (probably) not necessary
    cfg.set_float("-samprate", audio.frame_rate)

    # Process audio
    do_not_align_segments = None
    if config and "do-not-align" in config:
        # Reverse sort un-alignable segments
        do_not_align_segments = sorted(
            config["do-not-align"]["segments"], key=lambda x: x["begin"], reverse=True
        )
        method = config["do-not-align"].get("method", "remove")
        # Determine do-not-align method
        if method == "mute":
            dna_method = mute_section
        elif method == "remove":
            dna_method = remove_section
        else:
            LOGGER.error("Unknown do-not-align method declared")
        # Process audio and save temporary files
        if method == "mute" or method == "remove":
            processed_audio = audio
            for seg in do_not_align_segments:
                processed_audio = dna_method(
                    processed_audio, int(seg["begin"]), int(seg["end"])
                )
            if save_temps:
                _, ext = os.path.splitext(audio_path)
                try:
                    processed_audio.export(
                        save_temps + "_processed" + ext, format=ext[1:]
                    )
                except CouldntEncodeError:
                    os.remove(save_temps + "_processed" + ext)
                    LOGGER.warning(
                        f"Couldn't find encoder for '{ext[1:]}', defaulting to 'wav'"
                    )
                    processed_audio.export(save_temps + "_processed" + ".wav")
        raw_data = processed_audio.raw_data
    else:
        raw_data = audio.raw_data

    frame_points = int(cfg.get_float("-samprate") * cfg.get_float("-wlen"))
    fft_size = 1
    while fft_size < frame_points:
        fft_size = fft_size << 1
    cfg.set_int("-nfft", fft_size)
    ps = soundswallower.Decoder(cfg)
    frame_size = 1.0 / cfg.get_int("-frate")

    def frames_to_time(frames):
        return frames * frame_size

    ps.start_utt()
    ps.process_raw(raw_data, no_search=False, full_utt=True)
    ps.end_utt()

    if not ps.seg():
        raise RuntimeError(
            "Alignment produced no segments, "
            "please examine dictionary and input audio and text."
        )

    for seg in ps.seg():
        start = frames_to_time(seg.start_frame)
        end = frames_to_time(seg.end_frame + 1)
        # change to ms
        start_ms = start * 1000
        end_ms = end * 1000
        if do_not_align_segments and method == "remove":
            start_ms += calculate_adjustment(start_ms, do_not_align_segments)
            end_ms += calculate_adjustment(end_ms, do_not_align_segments)
            start_ms, end_ms = correct_adjustments(
                start_ms, end_ms, do_not_align_segments
            )
            # change back to seconds to write to smil
            start = start_ms / 1000
            end = end_ms / 1000
        if seg.word in ("<sil>", "[NOISE]"):
            continue
        else:
            results["words"].append({"id": seg.word, "start": start, "end": end})
        LOGGER.info("Segment: %s (%.3f : %.3f)", seg.word, start, end)

    if len(results["words"]) == 0:
        raise RuntimeError(
            "Alignment produced only noise or silence segments, "
            "please examine dictionary and input audio and text."
        )
    if len(results["words"]) != len(results["tokenized"].xpath("//" + unit)):
        raise RuntimeError(
            "Alignment produced a different number of segments and tokens, "
            "please examine dictionary and input audio and text."
        )

    final_end = end

    if not bare:
        # Split adjoining silence/noise between words
        last_end = 0.0
        last_word = dict()
        for word in results["words"]:
            silence = word["start"] - last_end
            midpoint = last_end + silence / 2
            if silence > 0:
                if last_word:
                    last_word["end"] = midpoint
                word["start"] = midpoint
            last_word = word
            last_end = word["end"]
        silence = final_end - last_end
        if silence > 0:
            if last_word is not None:
                last_word["end"] += silence / 2
    dict_file.close()
    if not save_temps:
        os.unlink(dict_file.name)
    fsg_file.close()
    if not save_temps:
        os.unlink(fsg_file.name)

    return results
Example 6
def align(**kwargs):
    """Align TEXTFILE and AUDIOFILE and create output files as OUTPUT_BASE.* in directory
    OUTPUT_BASE/.

    TEXTFILE:    Input text file path (in XML, or plain text with -i)

    AUDIOFILE:   Input audio file path, in any format supported by ffmpeg

    OUTPUT_BASE: Base name for output files
    """
    config = kwargs.get("config", None)
    if config:
        if config.endswith("json"):
            try:
                with open(config) as f:
                    config = json.load(f)
            except json.decoder.JSONDecodeError:
                raise click.BadParameter(f"Config file at {config} is not valid json.")
        else:
            raise click.BadParameter(f"Config file '{config}' must be in JSON format")

    output_dir = kwargs["output_base"]
    if os.path.exists(output_dir):
        if not os.path.isdir(output_dir):
            raise click.UsageError(
                f"Output folder '{output_dir}' already exists but is a not a directory."
            )
        if not kwargs["force_overwrite"]:
            raise click.UsageError(
                f"Output folder '{output_dir}' already exists, use -f to overwrite."
            )
    else:
        os.mkdir(output_dir)

    # Make sure we can write to the output directory, for early error checking and user
    # friendly error messages.
    try:
        with TemporaryFile(dir=output_dir):
            pass
    except Exception:
        raise click.UsageError(
            f"Cannot write into output folder '{output_dir}'. Please verify permissions."
        )

    output_basename = os.path.basename(output_dir)
    output_base = os.path.join(output_dir, output_basename)
    temp_base = None
    if kwargs["save_temps"]:
        temp_dir = os.path.join(output_dir, "tempfiles")
        if not os.path.isdir(temp_dir):
            if os.path.exists(temp_dir) and kwargs["force_overwrite"]:
                os.unlink(temp_dir)
            os.mkdir(temp_dir)
        temp_base = os.path.join(temp_dir, output_basename)

    if kwargs["debug"]:
        LOGGER.setLevel("DEBUG")
    if kwargs["text_input"]:
        if not kwargs["language"]:
            LOGGER.warn("No input language provided, using undetermined mapping")
        # note: avoid shadowing the tempfile module with this local variable
        temp_tei_file, kwargs["textfile"] = create_input_tei(
            input_file_name=kwargs["textfile"],
            text_language=kwargs["language"],
            save_temps=temp_base,
        )
    if kwargs["output_xhtml"]:
        tokenized_xml_path = "%s.xhtml" % output_base
    else:
        _, input_ext = os.path.splitext(kwargs["textfile"])
        tokenized_xml_path = "%s%s" % (output_base, input_ext)
    if os.path.exists(tokenized_xml_path) and not kwargs["force_overwrite"]:
        raise click.BadParameter(
            "Output file %s exists already, use -f to overwrite." % tokenized_xml_path
        )
    smil_path = output_base + ".smil"
    if os.path.exists(smil_path) and not kwargs["force_overwrite"]:
        raise click.BadParameter(
            "Output file %s exists already, use -f to overwrite." % smil_path
        )
    _, audio_ext = os.path.splitext(kwargs["audiofile"])
    audio_path = output_base + audio_ext
    if os.path.exists(audio_path) and not kwargs["force_overwrite"]:
        raise click.BadParameter(
            "Output file %s exists already, use -f to overwrite." % audio_path
        )
    unit = kwargs.get("unit", "w")
    bare = kwargs.get("bare", False)
    if not unit:
        # kwargs.get("unit", "w") above should handle this, but the way kwargs is
        # implemented, unit could still be None here.
        unit = "w"
    try:
        results = align_audio(
            kwargs["textfile"],
            kwargs["audiofile"],
            unit=unit,
            bare=bare,
            config=config,
            save_temps=temp_base,
        )
    except RuntimeError as e:
        LOGGER.error(e)
        exit(1)

    if kwargs["text_grid"]:
        audio = read_audio_from_file(kwargs["audiofile"])
        duration = audio.frame_count() / audio.frame_rate
        words, sentences = return_words_and_sentences(results)
        textgrid = write_to_text_grid(words, sentences, duration)
        textgrid.to_file(output_base + ".TextGrid")
        textgrid.to_eaf().to_file(output_base + ".eaf")

    if kwargs["closed_captioning"]:
        words, sentences = return_words_and_sentences(results)
        webvtt_sentences = write_to_subtitles(sentences)
        webvtt_sentences.save(output_base + "_sentences.vtt")
        webvtt_sentences.save_as_srt(output_base + "_sentences.srt")
        webvtt_words = write_to_subtitles(words)
        webvtt_words.save(output_base + "_words.vtt")
        webvtt_words.save_as_srt(output_base + "_words.srt")

    if kwargs["output_xhtml"]:
        convert_to_xhtml(results["tokenized"])

    save_minimal_index_html(
        os.path.join(output_dir, "index.html"),
        os.path.basename(tokenized_xml_path),
        os.path.basename(smil_path),
        os.path.basename(audio_path),
    )

    save_xml(tokenized_xml_path, results["tokenized"])
    smil = make_smil(
        os.path.basename(tokenized_xml_path), os.path.basename(audio_path), results
    )
    shutil.copy(kwargs["audiofile"], audio_path)
    save_txt(smil_path, smil)
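
A hedged sketch of exercising this click command from a test, using click.testing.CliRunner; the import location of align, the language code, and the -i/-l/-f option names are assumptions inferred from the docstring and error messages above, not confirmed against the actual CLI definition:

# Hypothetical invocation; option names and paths are assumptions.
from click.testing import CliRunner

runner = CliRunner()
result = runner.invoke(
    align, ["-i", "-l", "fra", "-f", "story.txt", "story.mp3", "output"]
)
print(result.exit_code, result.output)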
Example 7
 def setUp(self):
     super().setUp()
     self.audio_segment = read_audio_from_file(
         os.path.join(self.data_dir, "audio_sample.ogg"))
     self.noisy_segment = read_audio_from_file(
         os.path.join(self.data_dir, "noise_at_1500.mp3"))