def test_json_to_stm_conversion():
    """Convert two sample JSON transcripts to STM and check each result reloads."""
    ted_talk = time_aligned_text(f"{sample_dir}/BillGatesTEDTalk.json")
    convert_and_test_it_loads(ted_talk, f"{test_dir}/json_to_stm_test_1.stm")

    simple = time_aligned_text(f"{sample_dir}/simple_test.json")
    convert_and_test_it_loads(simple, f"{test_dir}/json_to_stm_test_2.stm")
def test_conversion():
    """Compute WER between the reference STM and an intentionally poor transcript."""
    reference_file = time_aligned_text("../samples/BillGatesTEDTalk.stm")
    transcript_file = time_aligned_text("../samples/BillGatesTEDTalk_intentionally_poor_transcription.txt")

    # WER is compared at fixed three-decimal precision so the expected value is stable
    score = wer(reference_file.text(), transcript_file.text(), True)
    assert "{:5.3f}".format(score) == "3.332"
def convert_and_test_it_loads(transcript_obj, output_filename):
    """
    Write transcript_obj to output_filename, confirm the written file parses
    back in, then remove the transitory file.
    """
    transcript_obj.write(output_filename)
    time_aligned_text(output_filename)  # raises if the written file cannot be re-read
    os.remove(output_filename)
def align_json(ref_txt, json_file, filename=None):
    """
    CLI for forced alignment tools

    Time-aligns a ground-truth reference txt file against a hypothesis gk JSON
    file and writes the aligned result as an STM file.

    Input
      ref_txt, str - reference text file containing ground truth
      json_file, str - hypothesis gk JSON file
      filename, str - output STM filename (derived from ref_txt when None)
    """
    ref_tokens = preprocess_txt.parse_transcript(ref_txt)
    gk_json = preprocess_gk_json.preprocess_transcript(json_file)
    segments = align(gk_json, ref_tokens)

    if filename is None:
        filename = basename(sanitize(strip_extension(ref_txt))) + ".stm"

    # stamp each segment with the output basename and a placeholder speaker label
    stem = strip_extension(filename)
    for seg in segments:
        seg.filename = stem
        seg.speaker = stem + "UnknownSpeaker"

    output = time_aligned_text()
    output.segments = segments
    output.write(filename)
def test_json_to_rttm_conversion_without_speaker():
    """Round-trip a speakerless JSON transcript through RTTM output."""
    source = time_aligned_text(f"{test_dir}/no_speaker.json")
    convert_and_test_it_loads(source, f"{test_dir}/no_speaker.rttm")
def split_audio_file(source_audio_file, source_transcript, target_directory):
    """Split source_audio_file into pieces under target_directory as dictated by source_transcript."""
    audio = audio_file(source_audio_file)
    text = time_aligned_text(source_transcript)
    audio.split(text, target_directory)
def validate_sample(ext, expected_transcripts, out_segments):
    """Convert the invalid sample STM to `ext`, then verify segment count and per-segment text."""
    base_output = f"{test_dir}/good"
    convert(f"{sample_dir}/invalid.stm", base_output + ext)

    result = time_aligned_text(base_output + ext)
    assert len(result.segments) == out_segments
    for segment, expected in zip(result.segments, expected_transcripts):
        assert segment.text == expected
def test_json_to_stm_conversion():
    """
    Convert two sample JSON transcripts to STM and compare the written
    output against the reference STM files by SHA-1 digest.

    Fix: four file handles were opened inline and never closed; all
    reads now use context managers.
    """
    input_file = time_aligned_text("../samples/BillGatesTEDTalk.json")
    with open("../samples/BillGatesTEDTalk_transcribed.stm", 'r', encoding='utf8') as ref_in:
        reference_sha = hashlib.sha1(ref_in.read().encode()).hexdigest()
    input_file.write("json_to_stm_test_1.stm")
    with open("json_to_stm_test_1.stm", 'r', encoding='utf8') as new_in:
        new_sha = hashlib.sha1(new_in.read().encode()).hexdigest()
    assert reference_sha == new_sha

    input_file = time_aligned_text("../samples/simple_test.json")
    with open("../samples/simple_test.stm", 'r', encoding='utf8') as ref_in:
        reference_sha = hashlib.sha1(ref_in.read().encode()).hexdigest()
    input_file.write("json_to_stm_test_2.stm")
    with open("json_to_stm_test_2.stm", 'r', encoding='utf8') as new_in:
        new_sha = hashlib.sha1(new_in.read().encode()).hexdigest()
    assert reference_sha == new_sha
def __init__(self, *args, **kwargs):
    """ Initialize from location and populate list of SPH, WAV, or MP3 audio files and STM files into exemplars """
    # accept any number of dicts positionally; each key becomes an attribute
    for dictionary in args:
        if isinstance(dictionary, dict):
            for key in dictionary:
                setattr(self, key, dictionary[key])
    # keyword arguments also become attributes (override dict values)
    for key in kwargs:
        setattr(self, key, kwargs[key])
    # only if not defined above should we search for exemplars
    # based on location
    if not self.exemplars:
        # instantiate exemplars for this object to override
        # static class variable
        self.exemplars = []
        # NOTE(review): list is reversed to ["mp3", "wav", "sph"] — presumably to set
        # a preference order among duplicate basenames; confirm intent
        audio_extensions_to_try = ["sph", "wav", "mp3"][::-1]
        # pair each audio file in self.location with a sibling .stm transcript,
        # keeping only audio files whose transcript actually exists
        self.exemplars += [
            exemplar({
                "audio_file": audio_file(fl),
                "transcript_file": time_aligned_text(strip_extension(fl) + ".stm"),
            })
            for audio_extension in audio_extensions_to_try
            for fl in (get_files(self.location, audio_extension) if self.location else [])
            if (os.path.exists(strip_extension(fl) + ".stm"))
        ]
        # gather all exemplars from /stm and /sph subdirectories if present
        # (audio under <location>/sph/, transcripts under <location>/stm/)
        self.exemplars += [
            exemplar({
                "audio_file": audio_file(fl),
                "transcript_file": time_aligned_text(self.location + "/stm/" + basename(strip_extension(fl)) + ".stm"),
            })
            for audio_extension in audio_extensions_to_try
            for fl in (get_files(self.location + "/sph/", audio_extension) if self.location else [])
            if (os.path.exists(self.location + "/stm/" + basename(strip_extension(fl)) + ".stm"))
        ]
def test_json_to_txt_conversion():
    """
    Convert a sample JSON transcript to txt and compare against the
    reference txt file by SHA-1 digest.

    Fix: the reference and output files were opened without ever being
    closed; reads now use context managers.
    """
    input_file = time_aligned_text("../samples/simple_test.json")
    with open("../samples/simple_test.txt", 'r', encoding='utf8') as ref_in:
        reference_sha = hashlib.sha1(ref_in.read().encode()).hexdigest()
    input_file.write("json_to_txt_test.txt")
    with open("json_to_txt_test.txt", 'r', encoding='utf8') as new_in:
        new_sha = hashlib.sha1(new_in.read().encode()).hexdigest()
    assert reference_sha == new_sha
def validate_sample(ext, expected_transcripts, out_segments):
    """Convert the invalid sample STM to `ext`, then verify segment count and per-segment text."""
    base_output = 'tests/good'
    convert('samples/invalid.stm', base_output + ext)

    result = time_aligned_text(base_output + ext)
    assert len(result.segments) == out_segments
    for segment, expected in zip(result.segments, expected_transcripts):
        assert segment.text == expected
def test_txt_initialization():
    """
    Build a time_aligned_text from a parsed JSON dict, serialize it as
    txt via its string form, and compare the round-tripped output to the
    reference file by SHA-1 digest.

    Fix: files (including the one fed to json.load) were opened without
    being closed; all reads now use context managers.
    """
    with open("samples/BillGatesTEDTalk.json") as json_in:
        input_dict = json.load(json_in)
    text = time_aligned_text(input_dict)
    text.file_extension = "txt"
    text_object = time_aligned_text(text.__str__())
    with open("samples/BillGatesTEDTalk_transcribed.txt", "r", encoding="utf8") as ref_in:
        reference_sha = hashlib.sha1(ref_in.read().encode()).hexdigest()
    text_object.write("tests/file_conversion_test.txt")
    with open("tests/file_conversion_test.txt", "r", encoding="utf8") as new_in:
        new_sha = hashlib.sha1(new_in.read().encode()).hexdigest()
    assert reference_sha == new_sha
def split_audio_file(source_audio_file, source_transcript, target_directory):
    """
    Split source audio file into segments denoted by transcript file into
    target_directory

    Results in stm and sph files in target directory
    """
    audio = audio_file(source_audio_file)
    audio.split(time_aligned_text(source_transcript), target_directory)
def test_stm_to_html_conversion():
    """
    Convert a sample STM file to HTML and compare against the reference
    HTML file by SHA-1 digest.

    Fix: the reference and output files were opened without being
    closed; reads now use context managers.
    """
    input_file = time_aligned_text("../samples/BillGatesTEDTalk.stm")
    input_file.write("stm_to_html_test.html")
    with open("../samples/BillGatesTEDTalk.html", 'r', encoding='utf8') as ref_in:
        reference_sha = hashlib.sha1(ref_in.read().encode()).hexdigest()
    with open("stm_to_html_test.html", 'r', encoding='utf8') as new_in:
        new_sha = hashlib.sha1(new_in.read().encode()).hexdigest()
    assert reference_sha == new_sha
def main():
    """CLI entry point: re-read an STM file and write it back in place."""
    parser = argparse.ArgumentParser(description='convert between text file formats')
    parser.add_argument('input_file', metavar='input_file', type=str, help='input stm file')
    args = parser.parse_args()

    # reading keeps only valid lines; writing back normalizes the file in place
    transcript = time_aligned_text(args.input_file)
    transcript.write(args.input_file)
def test_stm_to_txt_conversion():
    """
    Convert a sample STM file to txt and compare against the reference
    txt file by SHA-1 digest.

    Fix: the reference and output files were opened without being
    closed; reads now use context managers.
    """
    input_file = time_aligned_text("samples/BillGatesTEDTalk.stm")
    input_file.write("tests/stm_to_txt_test.txt")
    with open("samples/BillGatesTEDTalk.txt", "r", encoding="utf8") as ref_in:
        reference_sha = hashlib.sha1(ref_in.read().encode()).hexdigest()
    with open("tests/stm_to_txt_test.txt", "r", encoding="utf8") as new_in:
        new_sha = hashlib.sha1(new_in.read().encode()).hexdigest()
    assert reference_sha == new_sha
def test_conversion():
    """
    Convert a sample STM file to txt and compare against the reference
    txt file by SHA-1 digest.

    Fix: the reference and output files were opened without being
    closed; reads now use context managers.
    """
    from asrtoolkit.data_structures.time_aligned_text import time_aligned_text
    input_file = time_aligned_text("../samples/BillGatesTEDTalk.stm")
    input_file.write("file_conversion_test.txt")
    with open("../samples/BillGatesTEDTalk.txt", 'r', encoding='utf8') as ref_in:
        reference_sha = hashlib.sha1(ref_in.read().encode()).hexdigest()
    with open("file_conversion_test.txt", 'r', encoding='utf8') as new_in:
        new_sha = hashlib.sha1(new_in.read().encode()).hexdigest()
    assert reference_sha == new_sha
def test_conversion():
    """
    Convert a sample STM file to txt and compare against the reference
    txt file by SHA-1 digest.

    Fixes: digests were computed by shelling out to the non-portable
    `sha1sum` binary via subprocess.Popen, never waiting on the children
    and leaking their stdout pipes. Digests are now computed in-process
    with hashlib over the raw bytes, which yields the same hex digest
    sha1sum would print, on any platform.
    """
    import hashlib

    from asrtoolkit.data_structures.time_aligned_text import time_aligned_text
    input_file = time_aligned_text("../samples/BillGatesTEDTalk.stm")
    input_file.write("file_conversion_test.txt")
    # read as bytes: sha1sum hashes raw file contents
    with open("../samples/BillGatesTEDTalk.txt", "rb") as ref_in:
        reference_sha = hashlib.sha1(ref_in.read()).hexdigest()
    with open("file_conversion_test.txt", "rb") as new_in:
        new_sha = hashlib.sha1(new_in.read()).hexdigest()
    assert reference_sha == new_sha
def test_json_initialization():
    """
    Build a time_aligned_text from a parsed JSON dict, write it as STM,
    and compare against the reference STM file by SHA-1 digest.

    Fix: files (including the one fed to json.load) were opened without
    being closed; all reads now use context managers.
    """
    with open("samples/BillGatesTEDTalk.json") as json_in:
        input_dict = json.load(json_in)
    text_object = time_aligned_text(input_dict)
    with open("samples/BillGatesTEDTalk_transcribed.stm", 'r', encoding='utf8') as ref_in:
        reference_sha = hashlib.sha1(ref_in.read().encode()).hexdigest()
    text_object.write("tests/file_conversion_test.stm")
    with open("tests/file_conversion_test.stm", 'r', encoding='utf8') as new_in:
        new_sha = hashlib.sha1(new_in.read().encode()).hexdigest()
    assert reference_sha == new_sha
def main():
    """CLI entry point: read input_file and write it back out as output_file."""
    parser = argparse.ArgumentParser(description='convert between text file formats')
    parser.add_argument('input_file', metavar='input_file', type=str, help='input file')
    parser.add_argument('output_file', metavar='output_file', type=str, help='output file')
    args = parser.parse_args()

    time_aligned_text(args.input_file).write(args.output_file)
def main():
    """CLI entry point: print WER (or CER with --char-level) between two transcript files."""
    parser = argparse.ArgumentParser(
        description='Compares a reference and transcript file and calculates word error rate (WER) between these two files'
    )
    parser.add_argument('reference_file', metavar='reference_file', type=str, help='reference "truth" file')
    parser.add_argument('transcript_file',
                        metavar='transcript_file',
                        type=str,
                        help='transcript possibly containing errors')
    parser.add_argument("--char-level",
                        help="calculate character error rate instead of word error rate",
                        action="store_true")
    parser.add_argument("--ignore-nsns",
                        help="ignore non silence noises like um, uh, etc.",
                        action="store_true")
    args = parser.parse_args()

    # load both transcripts, then report the requested error rate
    reference = time_aligned_text(args.reference_file)
    hypothesis = time_aligned_text(args.transcript_file)
    if args.char_level:
        print("CER: {:5.3f}%".format(cer(reference, hypothesis, args.ignore_nsns)))
    else:
        print("WER: {:5.3f}%".format(wer(reference, hypothesis, args.ignore_nsns)))
def __init__(self, input_dict=None):
    """
    Initialize from location and populate list of SPH and STM files
    into segments
    """
    self.__dict__.update(input_dict if input_dict else {})
    # only search self.location when no exemplars were supplied above
    if not self.exemplars:
        audio_objects = [audio_file(path) for path in sorted(get_files(self.location, "sph"))]
        text_objects = [time_aligned_text(path) for path in sorted(get_files(self.location, "stm"))]
        # pair audio and transcript files positionally after sorting
        self.exemplars = [
            exemplar({
                "audio_file": audio,
                "transcript_file": text
            }) for audio, text in zip(audio_objects, text_objects)
        ]
def test_json_to_rttm_conversion():
    """Round-trip a sample JSON transcript through RTTM output."""
    source = time_aligned_text(f"{sample_dir}/simple_test.json")
    convert_and_test_it_loads(source, f"{test_dir}/json_to_rttm_test.rttm")
def test_json_to_txt_conversion():
    """Round-trip a sample JSON transcript through txt output."""
    source = time_aligned_text(f"{sample_dir}/simple_test.json")
    convert_and_test_it_loads(source, f"{test_dir}/json_to_txt_test.txt")
def test_stm_to_srt_conversion():
    """Round-trip a sample STM transcript through SRT output."""
    source = time_aligned_text(f"{sample_dir}/BillGatesTEDTalk.stm")
    convert_and_test_it_loads(source, f"{test_dir}/stm_to_srt_test.srt")
def check_transcript(transcript):
    """Load transcript as a time_aligned_text; log an error and exit(1) if it is invalid."""
    if not valid_input_file(transcript):
        LOGGER.error("Invalid transcript file {}".format(transcript))
        sys.exit(1)
    return time_aligned_text(input_data=transcript)
def assign_if_valid(file_name):
    """Return a time_aligned_text for file_name when it is a valid input file, else None."""
    from asrtoolkit.data_structures.time_aligned_text import time_aligned_text
    if valid_input_file(file_name):
        return time_aligned_text(file_name)
    return None