예제 #1
0
    def prepare_for_training(self, target, sample_rate=16000, nested=False):
        """
        Prepare this exemplar for training.

        The audio file's own prepare_for_training is called with the
        requested sample rate and the transcript file is written, both under
        `target` and both keeping their original base name. When `nested` is
        true the audio goes into a "sph" subdirectory and the transcript
        into a "stm" subdirectory; otherwise both go directly into `target`.

        Returns a new exemplar built from the two results, or None when
        either result is falsy.
        """
        # optional per-type subdirectory, empty for the flat layout
        audio_subdir = ("sph",) if nested else ()
        transcript_subdir = ("stm",) if nested else ()

        audio_destination = os.path.join(
            target, *audio_subdir, basename(self.audio_file.location))
        transcript_destination = os.path.join(
            target, *transcript_subdir,
            basename(self.transcript_file.location))

        prepared_audio = self.audio_file.prepare_for_training(
            audio_destination, sample_rate=sample_rate)
        written_transcript = self.transcript_file.write(transcript_destination)

        # both steps must have produced something usable
        if not (prepared_audio and written_transcript):
            return None

        return exemplar({
            "audio_file": prepared_audio,
            "transcript_file": written_transcript,
        })
예제 #2
0
    def validate(self):
        """
        Validate this exemplar.

        An exemplar is valid when
          - the audio and transcript file names match once directory and
            extension are removed,
          - the audio file is not empty, and
          - the transcript file is not empty.

        Returns True or False.
        Raises OSError (from os.path.getsize) if the file names match but a
        file does not exist or is inaccessible.
        """

        audio_filename = basename(strip_extension(self.audio_file.location))
        transcript_filename = basename(
            strip_extension(self.transcript_file.location))

        # `and` yields its last evaluated operand, so without bool() this
        # would hand back a file size (int) instead of True/False
        return bool(audio_filename == transcript_filename
                    and os.path.getsize(self.audio_file.location)
                    and os.path.getsize(self.transcript_file.location))
예제 #3
0
def align_json(ref_txt, json_file, filename=None):
    """
    Forced-alignment entry point: time-align a reference text file against
    a hypothesis gk JSON file and write the result as an STM file.

    Input
      ref_txt, str - reference text file containing ground truth
      json_file, str - hypothesis gk JSON file
      filename, str - output STM filename; when omitted it is derived from
        the sanitized base name of ref_txt with a ".stm" extension
    """

    reference_tokens = preprocess_txt.parse_transcript(ref_txt)
    hypothesis = preprocess_gk_json.preprocess_transcript(json_file)
    aligned_segments = align(hypothesis, reference_tokens)

    if filename is None:
        filename = basename(sanitize(strip_extension(ref_txt))) + ".stm"

    # every segment is labelled with the output file's stem and a
    # placeholder speaker derived from it
    stem = strip_extension(filename)
    speaker = stem + "UnknownSpeaker"
    for segment in aligned_segments:
        segment.filename = stem
        segment.speaker = speaker

    aligned_text = time_aligned_text()
    aligned_text.segments = aligned_segments
    aligned_text.write(filename)
예제 #4
0
    def __init__(self, *args, **kwargs):
        """
        Set attributes from dict arguments and keyword arguments, then, if
        no exemplars were supplied, collect them from `self.location`.

        Two layouts are searched, for MP3, WAV and SPH audio in that order:
          - audio files with an STM file of the same name beside them
          - audio files under <location>/sph/ with a matching STM file
            under <location>/stm/
        Audio files without an existing STM file are skipped.
        """
        # positional dictionaries first, then keyword arguments
        for candidate in args:
            if isinstance(candidate, dict):
                for name, value in candidate.items():
                    setattr(self, name, value)
        for name, value in kwargs.items():
            setattr(self, name, value)

        # exemplars passed in explicitly take precedence over a search
        # of the location
        if not self.exemplars:
            # give this instance its own list rather than sharing the
            # static class variable
            self.exemplars = []

            # same order as reversing ["sph", "wav", "mp3"]
            audio_extensions_to_try = ["mp3", "wav", "sph"]

            # audio files sitting next to their transcript
            side_by_side = []
            for audio_extension in audio_extensions_to_try:
                candidates = (get_files(self.location, audio_extension)
                              if self.location else [])
                for fl in candidates:
                    stm_path = strip_extension(fl) + ".stm"
                    if os.path.exists(stm_path):
                        side_by_side.append(
                            exemplar({
                                "audio_file": audio_file(fl),
                                "transcript_file": time_aligned_text(stm_path),
                            }))
            self.exemplars += side_by_side

            # gather all exemplars from /stm and /sph subdirectories if present
            nested = []
            for audio_extension in audio_extensions_to_try:
                candidates = (get_files(self.location + "/sph/",
                                        audio_extension)
                              if self.location else [])
                for fl in candidates:
                    stm_path = (self.location + "/stm/" +
                                basename(strip_extension(fl)) + ".stm")
                    if os.path.exists(stm_path):
                        nested.append(
                            exemplar({
                                "audio_file": audio_file(fl),
                                "transcript_file": time_aligned_text(stm_path),
                            }))
            self.exemplars += nested
예제 #5
0
    def prepare_for_training(self,
                             target=None,
                             nested=False,
                             sample_rate=16000):
        """
        Run validation and audio file preparation steps.

        Every exemplar's audio file is prepared (concurrently) and every
        transcript file is written under `target`, then a new corpus is
        built from the results, validated and logged.

        Input
          target, str - output directory; defaults to this corpus' own
            location, i.e. the corpus is written back in place
          nested, bool - when true, audio goes to <target>/sph/ and
            transcripts to <target>/stm/; otherwise both go to <target>/
          sample_rate, int - passed to each audio file's
            prepare_for_training

        Returns the result of calling log() on the new corpus.
        """

        # write corpus back in place if no target
        target = self.location if target is None else target

        # the context manager shuts the pool down once all results have
        # been collected, so no worker threads are left behind
        with ThreadPoolExecutor() as executor:
            # process audio files concurrently for speed
            futures = [
                executor.submit(
                    item.audio_file.prepare_for_training,
                    file_name=target + ("/sph/" if nested else "/") +
                    basename(item.audio_file.location),
                    sample_rate=sample_rate,
                ) for item in self.exemplars
            ]

            # trigger conversion and gather results
            audio_files = [future.result() for future in tqdm(futures)]

        transcript_files = [
            item.transcript_file.write(target +
                                       ("/stm/" if nested else "/") +
                                       basename(item.transcript_file.location))
            for item in self.exemplars
        ]

        new_corpus = corpus({
            "location":
            target,
            "exemplars": [
                exemplar({
                    "audio_file": af,
                    "transcript_file": tf
                }) for af, tf in zip(audio_files, transcript_files)
            ],
        })
        # NOTE(review): the return value of validate() is not used here -
        # confirm that is intended
        new_corpus.validate()
        return new_corpus.log()
예제 #6
0
    def validate(self):
        """
        Check that the audio and transcript files belong together and that
        neither is empty.

        The file names must be identical once directory and extension are
        removed. Always returns a bool.
        """

        audio_stem = basename(strip_extension(self.audio_file.location))
        transcript_stem = basename(
            strip_extension(self.transcript_file.location))

        # Audio and transcript filename must match
        if audio_stem != transcript_stem:
            return False

        # Audio file must not be empty
        if not os.path.getsize(self.audio_file.location):
            return False

        # Transcript file must not be empty; the size is an integer, so
        # convert it to a real boolean
        return bool(os.path.getsize(self.transcript_file.location))
def extract_xlsx(filename, target_folder):
    """
    For an excel spreadsheet, extract every sheet to a single text file.

    The output file is <target_folder>/<name>.txt, where <name> is the
    sanitized base name of `filename` without its extension. It is opened
    in append mode, so existing content is kept and new text is added
    after it.

    Input
      filename, str - path of the excel spreadsheet
      target_folder, str - directory that receives the text file
    """
    raw_name = sanitize(strip_extension(basename(filename)))
    output_path = ''.join([target_folder, '/', raw_name, ".txt"])

    # open the workbook first so no output file is created when the
    # spreadsheet cannot be read; the `with` closes the workbook handle
    # once all sheets are dumped
    with pd.ExcelFile(filename) as workbook, \
            open(output_path, 'a+') as output_file:
        for sheet in workbook.sheet_names:
            dump_sheet(output_file, workbook.parse(sheet).values)