Example #1
0
class WerScorer(BaseScorer):
    """Accumulate word error rate (WER) over reference/prediction pairs.

    Each string is passed through an ``EvaluationTokenizer`` configured from
    the ``wer_*`` fields of ``self.cfg`` and split on whitespace. The edit
    distance between the two token lists is summed over all added pairs and
    reported as a percentage of the total number of reference tokens.
    """

    def __init__(self, cfg):
        """Set up the counters, the edit-distance backend and the tokenizer.

        Args:
            cfg: scorer configuration, forwarded to ``BaseScorer.__init__``.
                ``wer_tokenizer``, ``wer_lowercase``, ``wer_remove_punct`` and
                ``wer_char_level`` are read back from ``self.cfg``
                (presumably stored there by ``BaseScorer`` -- its definition
                is not visible here, confirm).

        Raises:
            ImportError: if the optional ``editdistance`` package is missing.
        """
        super().__init__(cfg)
        self.reset()
        try:
            import editdistance as ed
        except ImportError as err:
            # Chain explicitly so the original import failure stays visible
            # as the direct cause in the traceback.
            raise ImportError(
                "Please install editdistance to use WER scorer"
            ) from err
        self.ed = ed
        self.tokenizer = EvaluationTokenizer(
            tokenizer_type=self.cfg.wer_tokenizer,
            lowercase=self.cfg.wer_lowercase,
            punctuation_removal=self.cfg.wer_remove_punct,
            character_tokenization=self.cfg.wer_char_level,
        )

    def reset(self):
        """Clear the accumulated edit distance and reference length."""
        self.distance = 0
        self.ref_length = 0

    def add_string(self, ref, pred):
        """Add one reference/prediction pair to the running totals.

        Args:
            ref: reference string.
            pred: predicted string.
        """
        ref_items = self.tokenizer.tokenize(ref).split()
        pred_items = self.tokenizer.tokenize(pred).split()
        self.distance += self.ed.eval(ref_items, pred_items)
        self.ref_length += len(ref_items)

    def result_string(self):
        """Return the current score formatted as ``"WER: <score>"``."""
        return f"WER: {self.score():.2f}"

    def score(self):
        """Return the WER as a percentage of reference tokens.

        Returns:
            ``100 * distance / ref_length`` as a float, or ``0.0`` when no
            reference tokens have been accumulated (avoids division by zero).
        """
        if self.ref_length > 0:
            return 100.0 * self.distance / self.ref_length
        # Always return a float so callers see one consistent type.
        return 0.0
Example #2
0
class SacrebleuScorer(BaseScorer):
    """Corpus-level BLEU scorer backed by the ``sacrebleu`` package.

    References and predictions are tokenized by an ``EvaluationTokenizer``
    when they are added, so ``sacrebleu`` itself is called with
    ``tokenize="none"``.
    """

    def __init__(self, cfg):
        """Import ``sacrebleu`` and build the tokenizer from ``cfg``.

        Args:
            cfg: scorer configuration, forwarded to ``BaseScorer.__init__``.
                Reads ``sacrebleu_tokenizer``, ``sacrebleu_lowercase`` and
                ``sacrebleu_char_level``.
        """
        super(SacrebleuScorer, self).__init__(cfg)
        import sacrebleu

        self.sacrebleu = sacrebleu
        self.tokenizer = EvaluationTokenizer(
            tokenizer_type=cfg.sacrebleu_tokenizer,
            lowercase=cfg.sacrebleu_lowercase,
            character_tokenization=cfg.sacrebleu_char_level,
        )

    def add_string(self, ref, pred):
        """Tokenize and store one reference/prediction pair.

        ``self.ref`` and ``self.pred`` are not created in this class;
        presumably ``BaseScorer.__init__`` initializes them as lists
        (TODO confirm).
        """
        self.ref.append(self.tokenizer.tokenize(ref))
        self.pred.append(self.tokenizer.tokenize(pred))

    def _corpus_bleu(self, order=4):
        """Return the raw ``sacrebleu.corpus_bleu`` result for stored pairs.

        Raises:
            NotImplementedError: if ``order`` is anything other than 4.
        """
        if order != 4:
            raise NotImplementedError
        # tokenization and lowercasing are performed by self.tokenizer instead.
        return self.sacrebleu.corpus_bleu(
            self.pred, [self.ref], tokenize="none"
        )

    def score(self, order=4):
        """Return the numeric BLEU score.

        The value is read from the raw result object: ``result_string``
        returns an already formatted ``str``, which has no ``score``
        attribute.

        Raises:
            NotImplementedError: if ``order`` is anything other than 4.
        """
        return self._corpus_bleu(order).score

    def result_string(self, order=4):
        """Return the BLEU result rendered by the result's ``format()``.

        Raises:
            NotImplementedError: if ``order`` is anything other than 4.
        """
        return self._corpus_bleu(order).format()
Example #3
0
 def __init__(self, args):
     """Import ``sacrebleu`` and build the evaluation tokenizer.

     Args:
         args: scorer arguments, forwarded to the parent constructor. The
             ``sacrebleu_*`` options are then read back from ``self.args``
             (presumably stored there by the parent class -- confirm).
     """
     super(SacrebleuScorer, self).__init__(args)
     import sacrebleu

     self.sacrebleu = sacrebleu
     # Gather the tokenizer options first, then build the tokenizer once.
     tokenizer_options = {
         "tokenizer_type": self.args.sacrebleu_tokenizer,
         "lowercase": self.args.sacrebleu_lowercase,
         "character_tokenization": self.args.sacrebleu_char_level,
     }
     self.tokenizer = EvaluationTokenizer(**tokenizer_options)
Example #4
0
    def __init__(self, cfg):
        """Import ``sacrebleu`` and build the evaluation tokenizer.

        Args:
            cfg: scorer configuration, forwarded to the parent constructor.
                Reads ``sacrebleu_tokenizer``, ``sacrebleu_lowercase`` and
                ``sacrebleu_char_level`` directly from it.
        """
        super(SacrebleuScorer, self).__init__(cfg)
        import sacrebleu

        self.sacrebleu = sacrebleu
        # Gather the tokenizer options first, then build the tokenizer once.
        tokenizer_options = {
            "tokenizer_type": cfg.sacrebleu_tokenizer,
            "lowercase": cfg.sacrebleu_lowercase,
            "character_tokenization": cfg.sacrebleu_char_level,
        }
        self.tokenizer = EvaluationTokenizer(**tokenizer_options)
Example #5
0
 def __init__(self, args):
     """Set up the counters, the edit-distance backend and the tokenizer.

     Args:
         args: scorer arguments, forwarded to the parent constructor. The
             ``wer_*`` options are then read back from ``self.args``
             (presumably stored there by the parent class -- confirm).

     Raises:
         ImportError: if the optional ``editdistance`` package is missing.
     """
     super().__init__(args)
     self.reset()
     try:
         import editdistance as ed
     except ImportError as err:
         # Chain explicitly so the original import failure stays visible
         # as the direct cause in the traceback.
         raise ImportError(
             "Please install editdistance to use WER scorer"
         ) from err
     self.ed = ed
     self.tokenizer = EvaluationTokenizer(
         tokenizer_type=self.args.wer_tokenizer,
         lowercase=self.args.wer_lowercase,
         punctuation_removal=self.args.wer_remove_punct,
         character_tokenization=self.args.wer_char_level,
     )
Example #6
0
class WerScorer(BaseScorer):
    """Accumulate word error rate (WER) over reference/prediction pairs.

    Each string is passed through an ``EvaluationTokenizer`` configured from
    the ``wer_*`` options of ``self.args`` and split on whitespace. The edit
    distance between the two token lists is summed over all added pairs and
    reported as a percentage of the total number of reference tokens.
    """

    def __init__(self, args):
        """Set up the counters, the edit-distance backend and the tokenizer.

        Args:
            args: scorer arguments, forwarded to ``BaseScorer.__init__``.
                The ``wer_*`` options are read back from ``self.args``
                (presumably stored there by ``BaseScorer`` -- its definition
                is not visible here, confirm).

        Raises:
            ImportError: if the optional ``editdistance`` package is missing.
        """
        super().__init__(args)
        self.reset()
        try:
            import editdistance as ed
        except ImportError as err:
            # Chain explicitly so the original import failure stays visible
            # as the direct cause in the traceback.
            raise ImportError(
                "Please install editdistance to use WER scorer"
            ) from err
        self.ed = ed
        self.tokenizer = EvaluationTokenizer(
            tokenizer_type=self.args.wer_tokenizer,
            lowercase=self.args.wer_lowercase,
            punctuation_removal=self.args.wer_remove_punct,
            character_tokenization=self.args.wer_char_level,
        )

    @staticmethod
    def add_args(parser):
        """Register the ``--wer-*`` command-line options on ``parser``."""
        # fmt: off
        parser.add_argument('--wer-tokenizer',
                            type=str,
                            default='none',
                            choices=EvaluationTokenizer.ALL_TOKENIZER_TYPES,
                            help='sacreBLEU tokenizer to use for evaluation')
        parser.add_argument('--wer-remove-punct',
                            action='store_true',
                            help='remove punctuation')
        parser.add_argument('--wer-char-level',
                            action='store_true',
                            help='evaluate at character level')
        parser.add_argument('--wer-lowercase',
                            action='store_true',
                            help='lowercasing')
        # fmt: on

    def reset(self):
        """Clear the accumulated edit distance and reference length."""
        self.distance = 0
        self.ref_length = 0

    def add_string(self, ref, pred):
        """Add one reference/prediction pair to the running totals.

        Args:
            ref: reference string.
            pred: predicted string.
        """
        ref_items = self.tokenizer.tokenize(ref).split()
        pred_items = self.tokenizer.tokenize(pred).split()
        self.distance += self.ed.eval(ref_items, pred_items)
        self.ref_length += len(ref_items)

    def result_string(self):
        """Return the current score formatted as ``"WER: <score>"``."""
        return f"WER: {self.score():.2f}"

    def score(self):
        """Return the WER as a percentage of reference tokens.

        Returns:
            ``100 * distance / ref_length`` as a float, or ``0.0`` when no
            reference tokens have been accumulated (avoids division by zero).
        """
        if self.ref_length > 0:
            return 100.0 * self.distance / self.ref_length
        # Always return a float so callers see one consistent type.
        return 0.0
Example #7
0
class SacrebleuScorer(BaseScorer):
    """Corpus-level BLEU scorer backed by the ``sacrebleu`` package.

    References and predictions are tokenized by an ``EvaluationTokenizer``
    when they are added, so ``sacrebleu`` itself is called with
    ``tokenize="none"``.
    """

    def __init__(self, args):
        """Import ``sacrebleu`` and build the tokenizer from the arguments.

        Args:
            args: scorer arguments, forwarded to ``BaseScorer.__init__``.
                The ``sacrebleu_*`` options are read back from ``self.args``
                (presumably stored there by ``BaseScorer`` -- its definition
                is not visible here, confirm).
        """
        super(SacrebleuScorer, self).__init__(args)
        import sacrebleu

        self.sacrebleu = sacrebleu
        self.tokenizer = EvaluationTokenizer(
            tokenizer_type=self.args.sacrebleu_tokenizer,
            lowercase=self.args.sacrebleu_lowercase,
            character_tokenization=self.args.sacrebleu_char_level,
        )

    @staticmethod
    def add_args(parser):
        """Register the ``--sacrebleu-*`` command-line options on ``parser``."""
        # fmt: off
        parser.add_argument('--sacrebleu-tokenizer',
                            type=str,
                            default='13a',
                            choices=EvaluationTokenizer.ALL_TOKENIZER_TYPES,
                            help='tokenizer')
        # NOTE(review): with type=str any supplied value (even "False") is a
        # non-empty, truthy string. Left unchanged to keep the CLI contract;
        # consider action='store_true' like the other flags.
        parser.add_argument('--sacrebleu-lowercase',
                            type=str,
                            default=False,
                            help='apply lowercasing')
        parser.add_argument('--sacrebleu-char-level',
                            action='store_true',
                            help='evaluate at character level')
        # fmt: on

    def add_string(self, ref, pred):
        """Tokenize and store one reference/prediction pair.

        ``self.ref`` and ``self.pred`` are not created in this class;
        presumably ``BaseScorer.__init__`` initializes them as lists
        (TODO confirm).
        """
        self.ref.append(self.tokenizer.tokenize(ref))
        self.pred.append(self.tokenizer.tokenize(pred))

    def _corpus_bleu(self, order=4):
        """Return the raw ``sacrebleu.corpus_bleu`` result for stored pairs.

        Raises:
            NotImplementedError: if ``order`` is anything other than 4.
        """
        if order != 4:
            raise NotImplementedError
        # tokenization and lowercasing are performed by self.tokenizer instead.
        return self.sacrebleu.corpus_bleu(self.pred, [self.ref],
                                          tokenize="none")

    def score(self, order=4):
        """Return the numeric BLEU score.

        The value is read from the raw result object: ``result_string``
        returns an already formatted ``str``, which has no ``score``
        attribute.

        Raises:
            NotImplementedError: if ``order`` is anything other than 4.
        """
        return self._corpus_bleu(order).score

    def result_string(self, order=4):
        """Return the BLEU result rendered by the result's ``format()``.

        Raises:
            NotImplementedError: if ``order`` is anything other than 4.
        """
        return self._corpus_bleu(order).format()