Exemple #1
0
    def _calc_metrics(self, ground_truth, hypothesis):
        """Score ``hypothesis`` against ``ground_truth`` with four jiwer measures.

        Both inputs go through the same jiwer transformation pipeline
        (lower-casing, whitespace clean-up, splitting into words) before
        they are compared.

        Returns:
            A ``(mer, wer, wil, wip)`` tuple holding the results of
            ``jiwer.mer``, ``jiwer.wer``, ``jiwer.wil`` and ``jiwer.wip``,
            in that order.
        """
        normalise = jiwer.Compose([
            jiwer.ToLowerCase(),
            jiwer.RemoveMultipleSpaces(),
            jiwer.RemoveWhiteSpace(replace_by_space=" "),
            jiwer.SentencesToListOfWords(word_delimiter=" "),
        ])

        # The identical pipeline is applied to both sides of every comparison.
        shared_kwargs = {
            "truth_transform": normalise,
            "hypothesis_transform": normalise,
        }

        measures = (jiwer.mer, jiwer.wer, jiwer.wil, jiwer.wip)
        return tuple(
            measure(ground_truth, hypothesis, **shared_kwargs)
            for measure in measures
        )
def compute_lipnet_wer(model, videos, device, verbose=False):
    """Print per-video and mean WER / MER / WIL of LipNet predictions.

    For every entry in ``videos`` a clip is generated with ``model``, handed
    to LipNet, and the predicted text is compared with the stored transcript
    using jiwer's ``wer``, ``mer`` and ``wil``. Videos for which
    ``lipnet.lipnet_predict`` returns ``None`` are skipped and do not count
    towards the printed means.

    Args:
        model: Callable producing the video; it is invoked with the keyword
            arguments ``test_latent``, ``test_sentence_path``,
            ``audio_multiplier``, ``audio_truncation`` and ``max_sec``.
        videos: Iterable of video identifiers that are interpolated into the
            latent and transcript paths.
        device: Forwarded to ``lipnet.get_model`` and ``lipnet.prepare_video``.
        verbose: When true, limits ``max_sec`` to 1 and is forwarded to
            ``lipnet.prepare_video``. The module-level ``args.verbose`` flag
            also enables this mode.

    Returns:
        None. All results are printed.

    Note:
        Depends on the module-level names ``lipnet``, ``np``, ``args``,
        ``dataset``, ``latent_root`` and ``transcript_root``.
    """
    from jiwer import wer, mer, wil

    # Honour the ``verbose`` argument (it used to be ignored), while still
    # respecting the global flag so existing callers behave as before.
    verbose = verbose or args.verbose

    lipnet_model = lipnet.get_model(device)

    # The clip-length limit does not depend on the video, so compute it once.
    max_sec = 30 if dataset == 'AudioVisualDataset' else None
    max_sec = 1 if verbose else max_sec

    wer_sum = 0.
    mer_sum = 0.
    wil_sum = 0.
    scored = 0  # number of videos that actually produced a prediction
    for video in videos:
        latentfile = f"{latent_root}{video}/mean.latent.pt"
        sentence = f"{latent_root}{video}/"
        transcriptfile = f"{transcript_root}{video}.transcript.txt"

        # Load transcript
        transcript = lipnet.read_transcript(transcriptfile)

        # Create video
        vid = model(test_latent=latentfile,
                    test_sentence_path=sentence,
                    audio_multiplier=args.audio_multiplier,
                    audio_truncation=args.audio_truncation,
                    max_sec=max_sec)
        # Move axis 1 behind axis 3 and rescale to 8-bit values
        # (presumably frames are channel-first floats in [0, 1] -- TODO confirm).
        vid = (np.rollaxis(vid.numpy(), 1, 4) * 255.).astype(np.uint8)

        vid = lipnet.prepare_video(vid, device, verbose=verbose)

        prediction = lipnet.lipnet_predict(vid, lipnet_model)
        if prediction is None:
            continue

        wer_error = wer(transcript, prediction)
        mer_error = mer(transcript, prediction)
        wil_error = wil(transcript, prediction)
        wer_sum += wer_error
        mer_sum += mer_error
        wil_sum += wil_error
        scored += 1
        print(
            f"WER {wer_error:.4f} - MER {mer_error:.4f} - WIL {wil_error:.4f} prediction: {prediction} | transcript: {transcript}"
        )

    # Average over the videos that were scored; skipped videos contributed
    # nothing to the sums, so dividing by len(videos) would bias the means.
    if not scored:
        print("No predictions were produced; cannot compute mean WER / MER / WIL")
        return

    print(f"Mean WER {wer_sum / scored:.4f}")
    print(f"Mean MER {mer_sum / scored:.4f}")
    print(f"Mean WIL {wil_sum / scored:.4f}")
Exemple #3
0
def _compute_wil_metric_jiwer(preds: Union[str, List[str]], target: Union[str, List[str]]):
    """Return the ``wil`` score of ``preds`` measured against ``target``.

    The arguments are swapped on the way in: ``wil`` receives ``target``
    first and ``preds`` second.
    """
    reference, hypothesis = target, preds
    return wil(reference, hypothesis)