import jiwer


def _calc_metrics(self, ground_truth, hypothesis):
    # Normalize both strings and split them into word lists before scoring.
    transformation = jiwer.Compose([
        jiwer.ToLowerCase(),
        jiwer.RemoveMultipleSpaces(),
        jiwer.RemoveWhiteSpace(replace_by_space=True),
        jiwer.SentencesToListOfWords(word_delimiter=" "),
    ])
    mer = jiwer.mer(ground_truth, hypothesis,
                    truth_transform=transformation,
                    hypothesis_transform=transformation)
    wer = jiwer.wer(ground_truth, hypothesis,
                    truth_transform=transformation,
                    hypothesis_transform=transformation)
    wil = jiwer.wil(ground_truth, hypothesis,
                    truth_transform=transformation,
                    hypothesis_transform=transformation)
    wip = jiwer.wip(ground_truth, hypothesis,
                    truth_transform=transformation,
                    hypothesis_transform=transformation)
    return mer, wer, wil, wip

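# A minimal alternative sketch (hypothetical helper, not part of the original code):
# jiwer.compute_measures returns 'wer', 'mer', 'wil' and 'wip' from a single
# alignment, so the four metrics above can be read out of one call instead of four.
def _calc_metrics_single_pass(ground_truth, hypothesis):
    transformation = jiwer.Compose([
        jiwer.ToLowerCase(),
        jiwer.RemoveMultipleSpaces(),
        jiwer.RemoveWhiteSpace(replace_by_space=True),
        jiwer.SentencesToListOfWords(word_delimiter=" "),
    ])
    measures = jiwer.compute_measures(
        ground_truth, hypothesis,
        truth_transform=transformation,
        hypothesis_transform=transformation,
    )
    return measures["mer"], measures["wer"], measures["wil"], measures["wip"]
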
from typing import List


def _str_clean(input_string: str) -> List[str]:
    """Normalize a string with jiwer transformations and split it into words."""
    transformation = jiwer.Compose([
        jiwer.ToLowerCase(),
        jiwer.RemoveMultipleSpaces(),
        jiwer.RemoveWhiteSpace(replace_by_space=True),
        jiwer.SentencesToListOfWords(word_delimiter=" "),
    ])
    # The final SentencesToListOfWords step means the result is a list of words,
    # not a single string.
    return transformation(input_string)

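# Hypothetical usage of the helper above, showing that the return value is a word list.
words = _str_clean("Hello   World")
print(words)  # expected: ['hello', 'world']
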
def sentence_wer(reference: str, prediction: str):
    transformation = jiwer.Compose([
        jiwer.RemoveMultipleSpaces(),
        jiwer.Strip(),
        jiwer.ToLowerCase(),
        # Expand contractions before stripping punctuation; otherwise the
        # apostrophes are already gone and the expansion patterns never match.
        jiwer.ExpandCommonEnglishContractions(),
        jiwer.RemovePunctuation(),
        jiwer.RemoveWhiteSpace(replace_by_space=True),
        jiwer.SentencesToListOfWords(),
        jiwer.RemoveEmptyStrings(),
    ])
    return jiwer.wer(reference.strip(), prediction.strip(),
                     truth_transform=transformation,
                     hypothesis_transform=transformation)

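# Hypothetical example: one substituted word out of five reference words.
score = sentence_wer("The quick brown fox jumps", "the quick brown fox jumped")
print(score)  # expected: 0.2
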
def normalize_sentence(sentence):
    """Normalize a sentence for scoring."""
    # Convert all characters to upper case.
    sentence = sentence.upper()
    # Delete punctuation.
    sentence = jiwer.RemovePunctuation()(sentence)
    # Replace \n, \t, \r, \x0c and other whitespace with plain spaces.
    sentence = jiwer.RemoveWhiteSpace(replace_by_space=True)(sentence)
    # Collapse runs of multiple spaces.
    sentence = jiwer.RemoveMultipleSpaces()(sentence)
    # Trim whitespace at both ends of the string.
    sentence = jiwer.Strip()(sentence)
    return sentence

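# Hypothetical input/output pair for the normalizer above.
print(normalize_sentence("Hello,\tworld!  "))  # expected: "HELLO WORLD"
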
from flask import jsonify, request


def analyze():
    try:
        req_data = request.get_json()
        compose_rule_set = []
        if req_data.get('to_lower_case', False):
            compose_rule_set.append(jiwer.ToLowerCase())
        if req_data.get('strip_punctuation', False):
            compose_rule_set.append(jiwer.RemovePunctuation())
        if req_data.get('strip_words', False):
            compose_rule_set.append(jiwer.Strip())
        if req_data.get('strip_multi_space', False):
            compose_rule_set.append(jiwer.RemoveMultipleSpaces())
        word_excepts = req_data.get('t_words', '')
        if word_excepts != '':
            words = [a.strip() for a in word_excepts.split(",")]
            compose_rule_set.append(jiwer.RemoveSpecificWords(words))
        compose_rule_set.append(
            jiwer.RemoveWhiteSpace(
                replace_by_space=req_data.get('replace_whitespace', False)))
        transformation = jiwer.Compose(compose_rule_set)
        measures = jiwer.compute_measures(req_data.get('s_truth', ""),
                                          req_data.get('s_hypo', ""),
                                          truth_transform=transformation,
                                          hypothesis_transform=transformation)
        return jsonify({
            "wer": measures['wer'],
            "mer": measures['mer'],
            "wil": measures['wil']
        })
    except Exception:
        return jsonify("API endpoint error")

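# Hypothetical request body for the endpoint above; the field names come from the
# code, the values are made up. The JSON response carries "wer", "mer" and "wil".
example_payload = {
    "s_truth": "hello world, this is a test",
    "s_hypo": "hello world this is test",
    "to_lower_case": True,
    "strip_punctuation": True,
    "strip_words": True,
    "strip_multi_space": True,
    "replace_whitespace": True,
    "t_words": "uh, um",  # comma-separated words removed before scoring
}
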
import re


def metric(ref_trans, asr_trans, lang):
    if lang == "en":
        transformation = jiwer.Compose([
            jiwer.Strip(),
            jiwer.ToLowerCase(),
            jiwer.RemoveWhiteSpace(replace_by_space=True),
            jiwer.RemoveMultipleSpaces(),
            jiwer.SentencesToListOfWords(word_delimiter=" "),
            jiwer.RemoveEmptyStrings(),
            jiwer.RemovePunctuation(),
        ])
        wer = jiwer.wer(
            ref_trans,
            asr_trans,
            truth_transform=transformation,
            hypothesis_transform=transformation,
        )
    elif lang == "cn":
        # Keep only CJK characters, then score at character level by putting a
        # space between every character.
        del_symbols = re.compile(r"[^\u4e00-\u9fa5]+")
        for idx in range(len(asr_trans)):
            sentence = re.sub(del_symbols, "", asr_trans[idx])
            asr_trans[idx] = " ".join(sentence)
            sentence = re.sub(del_symbols, "", ref_trans[idx])
            ref_trans[idx] = " ".join(sentence)
        # Sanity check: the cleaned hypotheses must not collapse into duplicates.
        asr_valid = set(asr_trans)
        assert len(asr_valid) == len(asr_trans)
        wer = jiwer.wer(ref_trans, asr_trans)
    else:
        raise ValueError("Args error!")
    return wer

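# Hypothetical usage: English scores on words, Chinese on space-separated characters
# (note that the Chinese branch expects lists and rewrites them in place).
wer_en = metric(["the quick brown fox"], ["the quick brown fox"], lang="en")
print(wer_en)  # expected: 0.0
wer_cn = metric(["今天天气很好"], ["今天天气不错"], lang="cn")
print(wer_cn)  # 2 substituted characters out of 6, expected: ~0.33
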
# Fragment of a larger subtitle-alignment loop (the enclosing loop headers are not
# part of this snippet).
    tstart = (next_sub.start.hours * 3600 + next_sub.start.minutes * 60
              + next_sub.start.seconds + next_sub.start.milliseconds / 1000)
    # If the next subtitle starts more than 0.5 s after the current segment ends,
    # push it back and stop extending the segment.
    if (tstart - end) > 0.5:
        srt.push(next_sub)
        break
    end = (next_sub.end.hours * 3600 + next_sub.end.minutes * 60
           + next_sub.end.seconds + next_sub.end.milliseconds / 1000)
    ground_truth = ground_truth + " " + next_sub.text_without_tags
    hypothesis = kd.query_text(start, end)
else:
    break

kd.mark_words(start, end)
transformation = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.RemoveWhiteSpace(replace_by_space=True),
    jiwer.SentencesToListOfWords(),
    jiwer.RemovePunctuation(),
    jiwer.RemoveEmptyStrings(),
    # Fold the Cyrillic letter "ё" into "е" so spelling variants are not counted as errors.
    jiwer.SubstituteRegexes({r"ё": r"е"}),
])
gt = transformation([ground_truth])
hp = transformation([hypothesis])
gt, hp = replace_pairs(gt, hp)
hp, gt = replace_pairs(hp, gt)
wer(gt, hp)
r = jiwer.compute_measures(
    gt,

# Fragment: tail of replace_punctuations_by_space(text), which replaces each
# punctuation character with a space (the start of the function is not shown).
        text = text.replace(p, ' ')
    return text


def RemovePunctuation(replace_by_space=False):
    # Either reuse jiwer's transform (drop punctuation entirely) or fall back to
    # the custom helper that turns punctuation into spaces.
    if not replace_by_space:
        return jiwer.RemovePunctuation()
    return replace_punctuations_by_space


transformation = jiwer.Compose([
    jiwer.ToLowerCase(),
    RemovePunctuation(replace_by_space=False),
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip(),
    jiwer.RemoveEmptyStrings(),
    jiwer.SentencesToListOfWords(),
    jiwer.RemoveWhiteSpace(replace_by_space=False),
    jiwer.RemoveEmptyStrings(),
])


def compute_avg_wer(ground_truth, hypothesis):
    assert len(ground_truth) == len(hypothesis)
    wer_sum = 0
    for gt, h in zip(ground_truth, hypothesis):
        wer_score = jiwer.wer(
            gt, h,
            truth_transform=transformation,
            hypothesis_transform=transformation
        )
        wer_sum += wer_score