def _calc_metrics(self, ground_truth, hypothesis):
    """Score ``hypothesis`` against ``ground_truth`` with four jiwer measures.

    Both inputs are passed through the same jiwer transform pipeline
    (``ToLowerCase``, ``RemoveMultipleSpaces``, ``RemoveWhiteSpace`` with
    space replacement, then ``SentencesToListOfWords`` split on a single
    space) before each measure is computed.

    Args:
        ground_truth: Reference text, forwarded unchanged to jiwer.
        hypothesis: Predicted text, forwarded unchanged to jiwer.

    Returns:
        A tuple ``(mer, wer, wil, wip)`` in that order.
    """
    # NOTE(review): the ``truth_transform`` keyword is tied to the jiwer
    # version this project pins -- confirm before upgrading jiwer.
    normalise = jiwer.Compose([
        jiwer.ToLowerCase(),
        jiwer.RemoveMultipleSpaces(),
        jiwer.RemoveWhiteSpace(replace_by_space=" "),
        jiwer.SentencesToListOfWords(word_delimiter=" "),
    ])
    transform_kwargs = {
        "truth_transform": normalise,
        "hypothesis_transform": normalise,
    }

    # Evaluated in the same order as the returned tuple: MER, WER, WIL, WIP.
    measures = (jiwer.mer, jiwer.wer, jiwer.wil, jiwer.wip)
    return tuple(
        measure(ground_truth, hypothesis, **transform_kwargs)
        for measure in measures
    )
def compute_lipnet_wer(model, videos, device, verbose=False):
    """Print per-video and mean WER / MER / WIL of LipNet on generated videos.

    For every entry of ``videos`` the function builds the latent, sentence
    and transcript paths from the module-level ``latent_root`` and
    ``transcript_root``, generates a video with ``model``, runs it through
    LipNet and compares the prediction with the transcript using jiwer.

    Videos for which ``lipnet.lipnet_predict`` returns ``None`` are skipped
    and are excluded from the mean, so the reported means cover only the
    videos that were actually scored.

    Args:
        model: Callable invoked with ``test_latent``, ``test_sentence_path``,
            ``audio_multiplier``, ``audio_truncation`` and ``max_sec``; its
            result must provide ``.numpy()``.
        videos: Iterable of video identifiers interpolated into the paths.
        device: Passed to ``lipnet.get_model`` and ``lipnet.prepare_video``.
        verbose: Currently not read by this function; verbosity is taken
            from the module-level ``args.verbose`` instead.
            NOTE(review): confirm whether this parameter should replace
            ``args.verbose``.

    Returns:
        None. All results are written to stdout.
    """
    from jiwer import wer, mer, wil

    lipnet_model = lipnet.get_model(device)

    wer_sum = 0.
    mer_sum = 0.
    wil_sum = 0.
    # Number of videos that contributed to the sums above; skipped videos
    # must not be part of the denominator of the mean.
    num_scored = 0

    for video in videos:
        latentfile = f"{latent_root}{video}/mean.latent.pt"
        sentence = f"{latent_root}{video}/"
        transcriptfile = f"{transcript_root}{video}.transcript.txt"

        # Load the transcript once, before the (expensive) video generation.
        transcript = lipnet.read_transcript(transcriptfile)

        # Create video
        max_sec = 30 if dataset == 'AudioVisualDataset' else None
        max_sec = 1 if args.verbose else max_sec
        vid = model(test_latent=latentfile,
                    test_sentence_path=sentence,
                    audio_multiplier=args.audio_multiplier,
                    audio_truncation=args.audio_truncation,
                    max_sec=max_sec)
        # Move axis 1 to the last position and rescale to 8-bit values.
        vid = (np.rollaxis(vid.numpy(), 1, 4) * 255.).astype(np.uint8)

        vid = lipnet.prepare_video(vid, device, verbose=args.verbose)
        prediction = lipnet.lipnet_predict(vid, lipnet_model)
        if prediction is None:
            # No usable prediction for this video: leave it out entirely.
            continue

        wer_error = wer(transcript, prediction)
        mer_error = mer(transcript, prediction)
        wil_error = wil(transcript, prediction)
        wer_sum += wer_error
        mer_sum += mer_error
        wil_sum += wil_error
        num_scored += 1
        print(
            f"WER {wer_error:.4f} - MER {mer_error:.4f} - WIL {wil_error:.4f} prediction: {prediction} | transcript: {transcript}"
        )

    if num_scored == 0:
        # Nothing was scored (empty input or every prediction was None);
        # a mean would be a division by zero.
        print("No videos produced a prediction; mean WER/MER/WIL are undefined.")
        return

    print(f"Mean WER {wer_sum / num_scored:.4f}")
    print(f"Mean MER {mer_sum / num_scored:.4f}")
    print(f"Mean WIL {wil_sum / num_scored:.4f}")
def _compute_wil_metric_jiwer(preds: Union[str, List[str]], target: Union[str, List[str]]):
    """Return the value of ``wil`` for ``preds`` measured against ``target``.

    The arguments arrive in ``(preds, target)`` order but ``wil`` is called
    with the target first and the predictions second, so they are swapped
    here.

    Args:
        preds: Predicted sentence or list of sentences.
        target: Reference sentence or list of sentences.

    Returns:
        Whatever ``wil(target, preds)`` returns.
    """
    reference = target
    hypothesis = preds
    score = wil(reference, hypothesis)
    return score