def main(reference, output, reference_file, output_file, alignment, error_type, output_format, digit):
    """
    Transcription compare tool provided by VoiceGain
    """
    # NOTE(review): the docstring above is kept verbatim; it is presumably
    # surfaced as the CLI help text by click (decorators are not visible
    # here) -- confirm before rewording it.
    #
    # Parameters, as used below:
    #   reference / output            -- inline text; used as-is when not None
    #   reference_file / output_file  -- objects exposing .read(); used only
    #                                    when the matching inline text is None
    #   alignment     -- forwarded as get_alignment_result to the calculator
    #   error_type    -- "CER" selects CharacterTokenizer, anything else
    #                    selects WordTokenizer
    #   output_format -- 'TABLE' or 'JSON'; any other value prints nothing
    #   digit         -- not used in this function
    #
    # Raises ValueError when neither the inline text nor the file is given
    # for the reference or for the output.

    # Inline text wins over the file object on each side.
    if reference is None:
        if reference_file is None:
            raise ValueError(
                "One of --reference and --reference_file must be specified")
        reference = reference_file.read()

    if output is None:
        if output_file is None:
            raise ValueError("One of --output and --output_file must be specified")
        output = output_file.read()

    tokenizer = CharacterTokenizer() if error_type == "CER" else WordTokenizer()
    calculator = UKKLevenshteinDistanceCalculator(
        tokenizer=tokenizer, get_alignment_result=alignment)

    if output_format not in ('TABLE', 'JSON'):
        return

    # Both formats share the same pipeline: align, try a correction for each
    # error section, then fold the corrections back into the full alignment.
    alignment_result = calculator.get_distance(reference, output).alignment_result
    error_sections = alignment_result.get_error_section_list()
    for section in error_sections:
        print("+++++++++++++++")
        print(section.original_alignment_result)
        section.set_correction(
            update_alignment_result(section.original_alignment_result))
    alignment_result.apply_error_section_list(error_sections)

    if output_format == 'TABLE':
        click.echo(alignment_result)
    else:
        # The distance value is not used afterwards; the call is retained
        # exactly as before in case it matters to the result object.
        _ = alignment_result.calculate_three_kinds_of_distance()[0]
        click.echo(alignment_result.to_json())
def update_alignment_result_error_section(self, alignment_result_error_section):
    """Look for a reference spelling that aligns better with the output.

    Works on ``alignment_result_error_section.original_alignment_result``.
    Each aligned reference token is passed to ``self.our_is_digit``; when
    that returns something other than ``False``, every element of the
    returned value is registered with the combination generator, otherwise
    the token itself is registered as the only option. Every combination
    the generator yields is joined with spaces and scored against the
    output string with a word-level distance.

    :param alignment_result_error_section: object exposing
        ``original_alignment_result``
    :return: a new alignment result for the best-scoring reference, or
        ``None`` when no combination scores strictly below the distance of
        the original alignment
    """
    section_alignment = alignment_result_error_section.original_alignment_result
    tokens = section_alignment.aligned_tokens_list
    scorer = UKKLevenshteinDistanceCalculator(
        tokenizer=WordTokenizer(), get_alignment_result=False)
    hypothesis = section_alignment.get_outputs_str()
    best_distance = section_alignment.calculate_three_kinds_of_distance()[0]

    combos = SimpleReferenceCombinationGenerator()
    for position in range(len(section_alignment)):
        digit_options = self.our_is_digit(tokens[position].reference)
        if digit_options is False:
            # Not a digit token: the token is its own single alternative.
            combos.add_new_token_options([tokens[position].reference])
            continue
        for option_group in digit_options:
            combos.add_new_token_options(option_group)

    # Keep the first candidate that reaches the lowest distance; ties with
    # the current best never replace it (strict comparison).
    best_reference = None
    for candidate_tokens in combos.get_all_reference():
        candidate = " ".join(candidate_tokens)
        candidate_distance = scorer.get_distance(candidate, hypothesis).distance
        if candidate_distance < best_distance:
            best_distance, best_reference = candidate_distance, candidate

    if best_reference is None:
        return None

    # Re-run with alignment enabled only for the winning reference.
    aligner = UKKLevenshteinDistanceCalculator(
        tokenizer=WordTokenizer(), get_alignment_result=True)
    return aligner.get_distance(best_reference, hypothesis).alignment_result
def update_alignment_result(alignment_result):
    """Look for a reference spelling that aligns better with the output.

    Each aligned reference token is passed to ``our_is_digit``; when that
    returns something other than ``False``, every element of the returned
    value is registered with the combination generator, otherwise the token
    itself is registered as the only option. Every combination the
    generator yields is joined with spaces and scored against the output
    string with a word-level distance.

    :param alignment_result: alignment to improve; must expose
        ``aligned_tokens_list``, ``get_outputs_str()``,
        ``calculate_three_kinds_of_distance()`` and ``len()``
    :return: a new alignment result for the best-scoring reference, or
        ``None`` when no combination scores strictly below the distance of
        ``alignment_result``
    """
    aligned_tokens_list = alignment_result.aligned_tokens_list
    calculator = UKKLevenshteinDistanceCalculator(tokenizer=WordTokenizer(),
                                                  get_alignment_result=False)
    output_string = alignment_result.get_outputs_str()
    old_distance = alignment_result.calculate_three_kinds_of_distance()[0]

    generator = SimpleReferenceCombinationGenerator()
    for index in range(len(alignment_result)):
        reference_token = aligned_tokens_list[index].reference
        result_digit = our_is_digit(reference_token)
        if result_digit is not False:
            for r in result_digit:
                generator.add_new_token_options(r)
        else:
            # Register the lone token as a one-element list of options, the
            # same shape as the option groups in the digit branch. A bare
            # string would presumably be treated as a sequence of
            # single-character options -- confirm against
            # SimpleReferenceCombinationGenerator.
            generator.add_new_token_options([reference_token])

    # Keep the first candidate that reaches the lowest distance; ties with
    # the current best never replace it (strict comparison).
    tmp_result = None
    for tokens in generator.get_all_reference():
        candidate = " ".join(tokens)
        distance = calculator.get_distance(candidate, output_string).distance
        if distance < old_distance:
            old_distance = distance
            tmp_result = candidate

    if tmp_result is None:
        return None

    # Re-run with alignment enabled only for the winning reference.
    calculator2 = UKKLevenshteinDistanceCalculator(tokenizer=WordTokenizer(),
                                                   get_alignment_result=True)
    return calculator2.get_distance(tmp_result, output_string).alignment_result
def run_transcription_compare(reference_path, output_file_list, output_html_path):
    """Compare several output transcripts with one reference and write HTML.

    The reference and every output file are read as UTF-8. Each output is
    scored against the reference with a word-level calculator, the results
    are combined through ``MultiResult`` and its ``to_html()`` rendering is
    written to ``output_html_path``.

    :param reference_path: path of the reference transcript
    :param output_file_list: iterable of paths of the transcripts to compare
    :param output_html_path: path of the HTML file to create or overwrite
    """
    logging.info("Start to compare results")
    with open(reference_path, "r", encoding='utf-8') as reference_file:
        reference_text = reference_file.read()

    calculator = UKKLevenshteinDistanceCalculator(
        tokenizer=WordTokenizer(),
        get_alignment_result=True,
        local_optimizers=[
            DigitUtil(process_output_digit=True),
            LocalCerOptimizer(),
        ])

    # output identifier (file basename) -> output text
    # NOTE: two paths sharing a basename map to the same key, so the later
    # one replaces the earlier one.
    output_all = {}
    for output_path in output_file_list:
        with open(output_path, "r", encoding='utf-8') as output_file:
            output_all[os.path.basename(output_path)] = output_file.read()
    logging.info("Finish reading all results")

    # output identifier -> result of comparing that output with the reference
    output_results = {}
    for key, value in output_all.items():
        logging.info("Start to process %s", key)
        output_results[key] = calculator.get_distance(
            reference_text, value,
            brackets_list=["[]", "()", "<>"],
            to_lower=True,
            remove_punctuation=True,
            use_alternative_spelling=True)

    logging.info("Merge all results into one HTML")
    calculator_local = UKKLevenshteinDistanceCalculator(
        tokenizer=CharacterTokenizer(), get_alignment_result=False)
    html = MultiResult(output_results, calculator_local).to_html()

    # Write with the same explicit encoding used for reading; the platform
    # default may be unable to encode characters taken from the transcripts.
    with open(output_html_path, 'w', encoding='utf-8') as f:
        f.write(html)
class TestCerFirst(unittest.TestCase):
    """Checks ``WordTokenizer.tokenize`` on "into" / "in to" style inputs.

    Tests 1-3 pass an empty ``brackets_list``; tests 4-6 repeat the same
    inputs with ``["()"]``.
    """

    def setUp(self) -> None:
        """Create the tokenizer under test."""
        self.word_tokenizer = WordTokenizer()

    def _tokenize(self, text, brackets):
        """Tokenize *text* with the given ``brackets_list``."""
        return self.word_tokenizer.tokenize(token_string=text,
                                            brackets_list=brackets)

    def test_into(self):
        """A single word yields a single token."""
        self.assertEqual(self._tokenize("into", []), ["into"])

    def test_into_2(self):
        """Two space-separated words yield two tokens."""
        self.assertEqual(self._tokenize("in to", []), ["in", "to"])

    def test_into_3(self):
        """A three-word phrase yields three tokens."""
        self.assertEqual(self._tokenize("into the monitor", []),
                         ["into", "the", "monitor"])

    def test_into_4(self):
        """Single word, with a round-bracket pair configured."""
        self.assertEqual(self._tokenize("into", ["()"]), ["into"])

    def test_into_5(self):
        """Two words, with a round-bracket pair configured."""
        self.assertEqual(self._tokenize("in to", ["()"]), ["in", "to"])

    def test_into_6(self):
        """Three-word phrase, with a round-bracket pair configured."""
        self.assertEqual(self._tokenize("into the monitor", ["()"]),
                         ["into", "the", "monitor"])
def update_alignment_result(self, alignment_result, process_output_digit):
    """Try alternative digit spellings on one side of an alignment.

    Every token of the checked side is passed to ``self.our_is_digit``. For
    a truthy result, each of its elements is a group of options; every
    option is tokenized (lower-cased, punctuation removed) and the group is
    registered with the combination generator. Other tokens are registered
    as their own single option. Each combination the generator yields is
    scored against the untouched side.

    :param alignment_result: alignment to improve
    :param process_output_digit: when truthy the output tokens are the ones
        varied and the reference is kept fixed; otherwise the reference
        tokens are varied and the output is kept fixed
    :return: a new alignment result for the best combination, or ``None``
        when no checked token produced a truthy ``our_is_digit`` result or
        no combination scores strictly below the current distance
    """
    word_tokenizer = WordTokenizer()
    scorer = UKKLevenshteinDistanceCalculator(
        tokenizer=None, get_alignment_result=False)

    output_token_list = alignment_result.get_outputs()
    reference_token_list = alignment_result.get_reference()
    best_distance = alignment_result.calculate_three_kinds_of_distance()[0]

    combos = SimpleReferenceCombinationGenerator()
    checked_tokens = (output_token_list if process_output_digit
                      else reference_token_list)

    found_digit = False
    for token in checked_tokens:
        digit_options = self.our_is_digit(token)
        if not digit_options:
            combos.add_new_token_options([token])
            continue
        found_digit = True
        for option_group in digit_options:
            combos.add_new_token_options([
                word_tokenizer.tokenize(option,
                                        to_lower=True,
                                        remove_punctuation=True)
                for option in option_group
            ])

    if not found_digit:
        return None

    def compare(candidate, calculator):
        # The candidate replaces whichever side was checked for digits.
        if process_output_digit:
            return calculator.get_result_from_list(
                reference_token_list, candidate)
        return calculator.get_result_from_list(candidate, output_token_list)

    # Keep the first candidate that reaches the lowest distance; ties with
    # the current best never replace it (strict comparison).
    best_candidate = None
    for candidate in combos.get_all_reference():
        candidate_distance = compare(candidate, scorer).distance
        if candidate_distance < best_distance:
            best_distance, best_candidate = candidate_distance, candidate

    if best_candidate is None:
        return None

    # Re-run with alignment enabled only for the winning candidate.
    aligner = UKKLevenshteinDistanceCalculator(
        tokenizer=None, get_alignment_result=True)
    return compare(best_candidate, aligner).alignment_result
def setUp(self) -> None:
    """Build a WordTokenizer and store it on ``self.word_tokenizer``."""
    tokenizer = WordTokenizer()
    self.word_tokenizer = tokenizer
def setUp(self) -> None:
    """Create the two calculators stored on the instance.

    ``self.calculator`` tokenizes by character and ``self.wer_calculator``
    tokenizes by word; both are built with ``get_alignment_result=True``.
    """
    def build(tokenizer):
        return UKKLevenshteinDistanceCalculator(
            tokenizer=tokenizer, get_alignment_result=True)

    self.calculator = build(CharacterTokenizer())
    self.wer_calculator = build(WordTokenizer())
from transcription_compare.levenshtein_distance_calculator import UKKLevenshteinDistanceCalculator
from transcription_compare.tokenizer import WordTokenizer
import re

# Scratch script: experiments with splitting a reference string into words,
# into fixed-size pieces, and with locating its spaces.

calculator = UKKLevenshteinDistanceCalculator(tokenizer=WordTokenizer(),
                                              get_alignment_result=True,
                                              digit_util=None)

reference = 'APPLE BANANA WATER BYE OK NO PROBLEM TIME LOG SEARCH'
output = 'APPLE BANANA HELLO WATER HA NO TIME LOG YES '
length = len(reference)

# The split never depended on a loop index, so it is computed and shown
# once instead of being repeated for every character position.
b = re.split(r" +", reference)
print(b)


def cut_text(text, lenth):
    """Split *text* into consecutive pieces of *lenth* characters.

    :param text: string to cut
    :param lenth: size of each full piece; must be positive
    :return: list of the full-size pieces followed by the remainder; the
        remainder is ``''`` when ``len(text)`` is an exact multiple of
        *lenth*
    :raises ValueError: if *lenth* is not positive
    """
    if lenth <= 0:
        raise ValueError("lenth must be a positive integer")
    # Slice by position rather than matching '.{n}': '.' skips newlines, so
    # text containing line breaks produced misaligned pieces.
    full_pieces = len(text) // lenth
    textArr = [text[i * lenth:(i + 1) * lenth] for i in range(full_pieces)]
    textArr.append(text[full_pieces * lenth:])
    return textArr


spaces_count = reference.count(' ')
print(length)
# print(spaces_count//2)
# Decide based on the number of spaces.
spaces_spot = [k for k, ch in enumerate(reference) if ch == ' ']
print(spaces_spot)
print('current_reference', all_reference[0]) print('current_output', current_output) d += calculator.get_distance(all_reference[0], current_output[0]).distance d += calculator.get_distance(all_reference[1], current_output[1]).distance print('old_distance', old_distance) print('d', d) if d < old_distance: old_distance = distance tmp_result = current_output print('tmp_result', tmp_result) if tmp_result is None: pass calculator2 = UKKLevenshteinDistanceCalculator( tokenizer=WordTokenizer(), get_alignment_result=True) update_result = calculator2.get_distance( all_reference[0], " ".join(tmp_result[0])).alignment_result update_result += calculator2.get_distance( all_reference[1], " ".join(tmp_result[1])).alignment_result print(update_result) def update_alignment_result_word(alignment_result): # fist check same character for row in alignment_result: if row.reference in row.output: index = list.index(row.reference) else: index = 0 # sort