Exemple #1
0
def clean(content):
    """Normalize a transcript string that uses ASCII placeholder characters.

    The text is passed through ``jiwer.RemoveKaldiNonWords``, the placeholder
    characters (``^ { ` } ~ #``) are replaced by ``ć š ž đ č dž``, and
    finally ``jiwer.RemoveMultipleSpaces`` is applied.

    ``content`` must be a string, because ``str.replace`` is called on the
    intermediate result. The cleaned string is returned.
    """
    placeholder_patterns = {
        r"{": r"š",
        r"`": r"ž",
        r"}": r"đ",
        r"~": r"č",
        r"#": r"dž"
    }
    strip_non_words = jiwer.RemoveKaldiNonWords()
    substitute_placeholders = jiwer.SubstituteRegexes(placeholder_patterns)
    squeeze_spaces = jiwer.RemoveMultipleSpaces()

    # "^" is handled with a plain string replacement, the rest via regexes.
    text = strip_non_words(content).replace("^", "ć")
    text = substitute_placeholders(text)
    return squeeze_spaces(text)
Exemple #2
0
def get_paired_text_corrected(batch):
    """Read and normalize the transcript file of every entry in ``batch``.

    For each filename the file ``path_txt + filename`` is read, passed through
    ``jiwer.RemoveKaldiNonWords``, has its ASCII placeholder characters
    (``^ { ` } ~ #``) replaced by ``ć š ž đ č dž``, and has repeated spaces
    collapsed with ``jiwer.RemoveMultipleSpaces``.

    Args:
        batch: iterable of filenames, each appended to the module-level
            ``path_txt`` prefix.

    Returns:
        A list with one cleaned sentence per filename, in input order
        (empty if ``batch`` is empty).
    """
    # Build the transforms once instead of once per file.
    strip_non_words = jiwer.RemoveKaldiNonWords()
    squeeze_spaces = jiwer.RemoveMultipleSpaces()
    substitute_placeholders = jiwer.SubstituteRegexes({
        r"{": r"š",
        r"`": r"ž",
        r"}": r"đ",
        r"~": r"č",
        r"#": r"dž"
    })

    sentences = []
    for filename in batch:
        # Only the read needs the open handle; process after closing it.
        with open(path_txt + filename, "r") as f:
            raw_text = f.read()

        sentence = strip_non_words(raw_text).replace("^", "ć")
        sentence = squeeze_spaces(sentence)
        # Substitute on this sentence only. Re-running the substitution over
        # the whole accumulated list on every iteration gave the same result
        # but did quadratic work.
        sentences.append(substitute_placeholders(sentence))
    return sentences
Exemple #3
0
def compute_perc_script_missing(original_script, transcript, language):
    '''
    Check how much of original_script is missing in transcript.

    Both texts are cleaned with the same jiwer pipeline, stopwords are removed
    from the script (unless that would remove every word), and a script word
    counts as present when its stem equals the stem of any transcript word.

    Args:
        original_script: script text; ``${...}`` placeholders are stripped
            before cleaning.
        transcript: transcript text to compare against the script.
        language: passed to ``remove_stopwords`` and ``get_stemmer``.

    Returns:
        A tuple ``(fraction_missing, words_missing)`` where ``words_missing``
        is the list of script words not found in the transcript and
        ``fraction_missing`` is its length divided by the number of script
        words considered. An empty cleaned script yields ``(0.0, [])``.
    '''
    cleaning = jiwer.Compose([
        # Lowercase first: the substitutions below are case-sensitive, so
        # running them on mixed-case text would leave e.g. "Á" or "Tardes"
        # un-normalised.
        jiwer.ToLowerCase(),
        jiwer.SubstituteRegexes({"¡": "", "¿":"", "á": "a", "é": "e", "í": "i", "ó": "o","ú": "u"}),
        jiwer.SubstituteWords({ "tardes": "dias",
                                "noches": "dias",
                                " uno ": " 1 ",
                                " dos ": " 2 ",
                                " tres ": " 3 ",
                                " cuatro ": " 4 ",
                                " cinco ": " 5 ",
                                " seis ": " 6 ",
                                " siete ": " 7 ",
                                " ocho ": " 8 ",
                                " nueve ": " 9 "}),
        jiwer.RemovePunctuation(),
        jiwer.SentencesToListOfWords(word_delimiter=" "),
        jiwer.RemoveEmptyStrings()
    ])

    # Remove anything of the form ${variable} from original_script
    original_script_transformed = re.sub(r'\${.*?\}','',original_script)

    # Clean both texts with the same pipeline
    original_script_transformed = cleaning(original_script_transformed)
    transcript_transformed = cleaning(transcript)

    # Remove stopwords from original_script; keep the unfiltered words when
    # stopword removal would leave nothing to compare.
    original_script_transformed_no_stopwords = remove_stopwords(original_script_transformed, language)
    if len(original_script_transformed_no_stopwords) != 0:
        original_script_transformed = original_script_transformed_no_stopwords

    # A script with no words left (e.g. only ${...} placeholders) has nothing
    # that can be missing; return early instead of dividing by zero.
    if len(original_script_transformed) == 0:
        return 0.0, []

    # Stem the transcript once; a set gives constant-time membership tests.
    stemmer = get_stemmer(language)
    transcript_stems = {stemmer.stem(word) for word in transcript_transformed}

    # Words from the script whose stem does not occur in the transcript
    words_missing = [word for word in original_script_transformed if stemmer.stem(word) not in transcript_stems]

    return len(words_missing)/len(original_script_transformed), words_missing
Exemple #4
0
                    end = next_sub.end.hours * 3600 + next_sub.end.minutes * 60 + next_sub.end.seconds + next_sub.end.milliseconds / 1000

                    ground_truth = ground_truth + " " + next_sub.text_without_tags
                    hypothesis = kd.query_text(start, end)
                else:
                    break
            kd.mark_words(start, end)

            transformation = jiwer.Compose([
                jiwer.ToLowerCase(),
                jiwer.RemoveMultipleSpaces(),
                jiwer.RemoveWhiteSpace(replace_by_space=True),
                jiwer.SentencesToListOfWords(),
                jiwer.RemovePunctuation(),
                jiwer.RemoveEmptyStrings(),
                jiwer.SubstituteRegexes({r"ё": r"е"})
            ])
            gt = transformation([ground_truth])
            hp = transformation([hypothesis])

            gt, hp = replace_pairs(gt, hp)
            hp, gt = replace_pairs(hp, gt)

            wer(gt, hp)

            r = jiwer.compute_measures(
                gt,
                hp
            )
            print(f"\nWER:{r['wer'] * 100:.3f}\t\tS:{r['S']} D:{r['D']} H:{r['H']} I:{r['I']}\n")
Exemple #5
0
import jiwer
import textwrap
import regex


# jiwer.RemovePunctuation only removes string.punctuation, not all Unicode
# punctuation, hence this replacement transform.
class RemovePunctuation(jiwer.AbstractTransform):
    """jiwer transform that deletes every Unicode punctuation character."""

    def process_string(self, s: str):
        """Return ``s`` with all matches of the regex class P removed."""
        without_punctuation = regex.sub(r"\p{P}", "", s)
        return without_punctuation


# Normalisation pipeline: removes differences that we don't care about for
# comparisons (case, punctuation, filler words, number spelling, spacing).

# Filler words and phrases that are deleted outright.
_FILLER_PATTERNS = {
    r"\b(uh|um|ah|hi|alright|all right|well|kind of)\b": "",
}

# Whole-word replacements: spelled-out numbers, operators, compound words.
_WORD_REPLACEMENTS = {
    "one": "1",
    "two": "2",
    "three": "3",
    "four": "4",
    "five": "5",
    "six": "6",
    "seven": "7",
    "eight": "8",
    "nine": "9",
    "ten": "10",
    "plus": "+",
    "minus": "-",
    "check out": "checkout",
    "hard point": "hardpoint",
}

_NORMALISATION_STEPS = [
    jiwer.ToLowerCase(),
    RemovePunctuation(),
    jiwer.SubstituteRegexes(_FILLER_PATTERNS),
    jiwer.SubstituteWords(_WORD_REPLACEMENTS),
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip(),
    jiwer.SentencesToListOfWords(),
    jiwer.RemoveEmptyStrings(),
]

transform = jiwer.Compose(_NORMALISATION_STEPS)

if __name__ == "__main__":
    # The module-level imports do not provide argparse, so import it here
    # where the command-line parser is built; without this the script fails
    # with NameError as soon as it is run.
    import argparse

    # Command line: one positional ``json_path`` and an optional
    # ``--verbose`` flag that defaults to False.
    parser = argparse.ArgumentParser()
    parser.add_argument("json_path")
    parser.add_argument("--verbose", action="store_true")
Exemple #6
0
                        break
                    tstart = next_sub.start.hours * 3600 + next_sub.start.minutes * 60 + next_sub.start.seconds + next_sub.start.milliseconds / 1000
                    if (tstart - end) > 0.5:
                        srt.push(next_sub)
                        break
                    end = next_sub.end.hours * 3600 + next_sub.end.minutes * 60 + next_sub.end.seconds + next_sub.end.milliseconds / 1000

                    ground_truth = ground_truth + " " + next_sub.text_without_tags
                    hypothesis = kd.query_text(start, end)
                else:
                    break
            kd.mark_words(start, end)

            transformation = jiwer.Compose([
                jiwer.ToLowerCase(),
                jiwer.SubstituteRegexes({r"…|–|«|»": r""}),
                jiwer.RemoveMultipleSpaces(),
                jiwer.RemoveWhiteSpace(replace_by_space=True),
                jiwer.SentencesToListOfWords(),
                jiwer.RemovePunctuation(),
                jiwer.RemoveEmptyStrings(),
                jiwer.SubstituteRegexes({r"ё": r"е"}),
            ])
            gt = transformation([ground_truth])
            hp = transformation([hypothesis])

            gt, hp = replace_pairs(gt, hp)
            hp, gt = replace_pairs(hp, gt)

            wer(gt, hp)