Beispiel #1
0
def main(reference, output, reference_file, output_file, alignment, error_type,
         output_format, digit):
    """
    Transcription compare tool provided by VoiceGain
    """
    # NOTE: the docstring above is intentionally left unchanged; the option
    # names in the error messages suggest this is a CLI entry point, in which
    # case the docstring may be rendered as help text -- TODO confirm.
    #
    # Parameters:
    #   reference / output: texts to compare; each takes precedence over its
    #       ``*_file`` counterpart when it is not None.
    #   reference_file / output_file: objects exposing ``read()``; used only
    #       when the corresponding text is None.
    #   alignment: forwarded as ``get_alignment_result`` to the calculator.
    #   error_type: "CER" selects CharacterTokenizer, anything else selects
    #       WordTokenizer.
    #   output_format: 'TABLE' echoes the alignment result, 'JSON' echoes its
    #       ``to_json()`` form; any other value produces no output.
    #   digit: accepted but not used by this function.
    #
    # Raises ValueError when neither the text nor the file is supplied for
    # the reference or for the output.
    if reference is None:
        if reference_file is None:
            raise ValueError(
                "One of --reference and --reference_file must be specified")
        reference = reference_file.read()

    if output is None:
        if output_file is None:
            raise ValueError(
                "One of --output and --output_file must be specified")
        output = output_file.read()

    tokenizer = CharacterTokenizer() if error_type == "CER" else WordTokenizer()
    calculator = UKKLevenshteinDistanceCalculator(
        tokenizer=tokenizer, get_alignment_result=alignment)

    if output_format not in ('TABLE', 'JSON'):
        return

    # The correction pipeline is identical for both formats; only the final
    # rendering differs.
    alignment_result = calculator.get_distance(reference,
                                               output).alignment_result
    error_list = alignment_result.get_error_section_list()
    for error_section in error_list:
        # Diagnostics go to stderr so that stdout carries only the final
        # result (otherwise the JSON output is not parseable).
        click.echo("+++++++++++++++", err=True)
        click.echo(error_section.original_alignment_result, err=True)
        error_section.set_correction(
            update_alignment_result(error_section.original_alignment_result))
    alignment_result.apply_error_section_list(error_list)

    if output_format == 'TABLE':
        click.echo(alignment_result)
        return

    # NOTE(review): the original bound the first element of this call to an
    # unused local. The call is retained in case it has side effects --
    # TODO confirm and remove.
    alignment_result.calculate_three_kinds_of_distance()
    click.echo(alignment_result.to_json())
def run_transcription_compare(reference_path, output_file_list,
                              output_html_path):
    """Compare transcription outputs against a reference and write an HTML report.

    Every file in ``output_file_list`` is compared with the reference text
    using a word-level ``UKKLevenshteinDistanceCalculator``; the per-file
    results are combined by ``MultiResult`` and its ``to_html()`` output is
    written to ``output_html_path``.

    Args:
        reference_path: path of the UTF-8 reference transcript.
        output_file_list: iterable of paths of UTF-8 transcripts to compare.
            Results are keyed by file basename, so two paths sharing a
            basename overwrite each other.
        output_html_path: destination path of the HTML report (written as
            UTF-8).

    Raises:
        OSError: if any input cannot be opened or the report cannot be written.
    """
    logging.info("Start to compare results")

    with open(reference_path, "r", encoding='utf-8') as reference_file:
        reference_text = reference_file.read()

    calculator = UKKLevenshteinDistanceCalculator(
        tokenizer=WordTokenizer(),
        get_alignment_result=True,
        local_optimizers=[
            DigitUtil(process_output_digit=True),
            LocalCerOptimizer()
        ])

    output_all = {}  # (output identifier -> output string)
    for output_path in output_file_list:
        with open(output_path, "r", encoding='utf-8') as output_file:
            output_all[os.path.basename(output_path)] = output_file.read()
    logging.info("Finish reading all results")

    output_results = {}  # (output identifier -> get_distance() result)
    for key, value in output_all.items():
        logging.info("Start to process %s", key)
        output_results[key] = calculator.get_distance(
            reference_text,
            value,
            brackets_list=["[]", "()", "<>"],
            to_lower=True,
            remove_punctuation=True,
            use_alternative_spelling=True)

    logging.info("Merge all results into one HTML")
    calculator_local = UKKLevenshteinDistanceCalculator(
        tokenizer=CharacterTokenizer(), get_alignment_result=False)

    html = MultiResult(output_results, calculator_local).to_html()

    # Write with the same explicit encoding used for the inputs; the platform
    # default may be unable to encode non-ASCII transcript text.
    with open(output_html_path, 'w', encoding='utf-8') as html_file:
        html_file.write(html)
Beispiel #3
0
    def update_alignment_result_error_section(self,
                                              alignment_result_error_section):
        """Pick the option of an error section with the lowest total CER.

        Each option returned by ``get_all_options()`` is scored with
        ``get_total_cer`` using a character-level calculator. An option is
        chosen only if its score is strictly lower than the score of the
        section's original alignment result; on ties the earliest option wins.

        Returns:
            The chosen option, or None when the section has no options or no
            option improves on the original.
        """
        candidates = alignment_result_error_section.get_all_options()
        if candidates is None:
            return None

        cer_calculator = UKKLevenshteinDistanceCalculator(
            tokenizer=CharacterTokenizer(), get_alignment_result=False)

        original = alignment_result_error_section.original_alignment_result
        best_cer = original.get_total_cer(cer_calculator)
        best_candidate = None
        for candidate in candidates:
            candidate_cer = candidate.get_total_cer(cer_calculator)
            if not candidate_cer < best_cer:
                continue
            # Strict improvement: remember it and raise the bar.
            best_cer, best_candidate = candidate_cer, candidate
        return best_candidate
from transcription_compare.levenshtein_distance_calculator import UKKLevenshteinDistanceCalculator
from transcription_compare.tokenizer import CharacterTokenizer

calculator = UKKLevenshteinDistanceCalculator(tokenizer=CharacterTokenizer(),
                                              get_alignment_result=True)

# (reference, output) pairs whose character-level distance result is printed,
# in this order. The ('batman', 'b') pair appears twice on purpose to keep the
# printed output identical to the original script.
_SAMPLE_PAIRS = [
    ("abc", "abd"),
    ("abc", "abcd"),
    ("AV", "Abc"),
    ('batman', 'b'),
    ('b', 'batman'),
    ('batman', 'b'),
    ('AVERY', 'GARVEY'),
    # NOTE(original author): "except this one; before the 1 c"
    # -- meaning unclear, TODO confirm.
    ('ernest', 'nester'),
    ('werewolf', 'were  wolf'),
    # NOTE(original author): "excep; before the 1 c"
    # -- meaning unclear, TODO confirm.
    ('jijizhazha', 'hahahaaaa???'),
    ('helloa a a ?', 'HHHHHHHoooooo'),
    ('happyeveryday', 'happybirthday'),
]

for _reference_text, _output_text in _SAMPLE_PAIRS:
    print(calculator.get_distance(_reference_text, _output_text))
Beispiel #5
0
 def setUp(self) -> None:
     """Create the character-level calculator (alignment enabled) for each test."""
     char_tokenizer = CharacterTokenizer()
     self.calculator = UKKLevenshteinDistanceCalculator(
         tokenizer=char_tokenizer,
         get_alignment_result=True,
     )
# Script fragment: `alignment_result` is created before this point (not
# visible here). It appends one more aligned token, merges None tokens, then
# tries to replace each error section with a lower-CER alternative.
#
# alignment_result.add_token(ref_token="someday", output_tokens=["xi"], add_to_left=False)
alignment_result.add_token(ref_token="someday",
                           output_tokens=["ays"],
                           add_to_left=False)
alignment_result.merge_none_tokens()
print('alignment_result', alignment_result)

error_list = alignment_result.get_error_section_list()
for e in error_list:
    alignment_result_options = e.get_all_options()

    # No alternatives for this section: leave it untouched.
    if alignment_result_options is None:
        continue

    # Character-level calculator used to score the original and each option.
    calculator = UKKLevenshteinDistanceCalculator(
        tokenizer=CharacterTokenizer(), get_alignment_result=False)

    # Score to beat: the total CER of the section's original alignment.
    old_distance = e.original_alignment_result.get_total_cer(calculator)

    # Keep the option with the lowest total CER that is strictly better than
    # the original; on ties the earliest option wins.
    tmp_result = None
    for alignment_result_option in alignment_result_options:
        d = alignment_result_option.get_total_cer(calculator)
        if d < old_distance:
            old_distance = d
            tmp_result = alignment_result_option

    # No option improved on the original: record no correction.
    if tmp_result is None:
        continue
    e.set_correction(tmp_result)

    # Both the corrected sections and the ones that were skipped from the
    # start get applied back.