Example #1
    def _compute_nltk_bleu(self, batch: Batch, texts: List[str]):
        """
        Compute BLEU score between text and label(s), using the NLTK BLEU Scorer.

        Note this differs from BLEU in ParlAI metrics in that the answers
        are unnormalized (no removal of stop words, etc.)

        :param batch:
            Batch of observations
        :param texts:
            list of string predictions
        """

        results = {}
        for i, p in enumerate(texts):
            obs = batch.observations[i]
            references = []
            for lbl in obs['eval_labels']:
                references.append(
                    self._v2t(
                        self._vectorize_text(
                            lbl, True, True, self.label_truncate, False
                        )
                    )
                )
            for k in range(1, 5):
                b = BleuMetric.compute(p, references, k)
                if b is None:
                    # compute() can return None (e.g. NLTK unavailable); fall back to a zero metric
                    b = BleuMetric(0)
                if k not in results:
                    results[k] = []
                results[k].append(b)

        for k in range(1, 5):
            self.record_local_metric(f'nltk_bleu{k}', results[k])
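For orientation, BleuMetric.compute(guess, answers, k) is essentially a wrapper around NLTK's sentence-level BLEU with uniform k-gram weights (ParlAI also normalizes both strings before tokenizing). The standalone sketch below reproduces that call shape with plain whitespace tokenization, so treat it as an approximation rather than a drop-in replacement.

from nltk.translate import bleu_score as nltkbleu

def nltk_bleu_k(guess, answers, k=4):
    # Sentence-level BLEU-k with uniform n-gram weights; ParlAI's BleuMetric
    # additionally runs its normalize_answer() on both sides first.
    return nltkbleu.sentence_bleu(
        [a.split() for a in answers],  # references: one token list per answer
        guess.split(),                 # hypothesis tokens
        weights=[1.0 / k] * k,
        smoothing_function=nltkbleu.SmoothingFunction(epsilon=1e-12).method1,
    )

# e.g. BLEU-2 of a prediction against two references
print(nltk_bleu_k('the cat sat on the mat',
                  ['the cat sat on a mat', 'a cat is on the mat'], k=2))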
Example #2
    def custom_evaluation(
        self, teacher_action: Message, labels, model_response: Message
    ):
        resp = model_response.get('text')
        if not resp:
            return

        if teacher_action['type'] == 'apicall' and resp.startswith('apicall: '):
            gold = teacher_action['slots']
            # strip the leading 'apicall: ' prefix, then split into slot strings
            slot_strs = resp[len('apicall: '):].split(' ; ')
            parsed = {}
            for slot_str in slot_strs:
                if ' = ' not in slot_str:
                    if slot_str != '':
                        # syntactically invalid generations should count against us
                        self.metrics.add('slot_p', AverageMetric(0))
                    continue
                name, value = slot_str.split(' = ')
                parsed[name] = value

            # slot precision
            for k, v in parsed.items():
                self.metrics.add('slot_p', AverageMetric(v == gold.get(k)))
            # slot recall
            for k, v in gold.items():
                self.metrics.add('slot_r', AverageMetric(v == parsed.get(k)))
        elif teacher_action['type'] == 'apiresp':
            delex_resp = self._delex(resp, teacher_action['slots'])
            delex_label = self._delex(labels[0], teacher_action['slots'])
            self.metrics.add(
                'delex_bleu', BleuMetric.compute(delex_resp, [delex_label])
            )
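To make the parsing convention above concrete, here is a toy walk-through of the 'apicall: slot = value ; ...' format and the resulting precision/recall bookkeeping; the response and gold slots are invented for illustration.

resp = 'apicall: city = Boston ; date = tomorrow ; not-a-slot'
gold = {'city': 'Boston', 'date': 'today'}

parsed = {}
for slot_str in resp[len('apicall: '):].split(' ; '):
    if ' = ' not in slot_str:
        # syntactically invalid chunk; the teacher above counts it as a precision miss
        continue
    name, value = slot_str.split(' = ')
    parsed[name] = value

precision_hits = [v == gold.get(k) for k, v in parsed.items()]  # [True, False]
recall_hits = [gold[k] == parsed.get(k) for k in gold]          # [True, False]
print(parsed, precision_hits, recall_hits)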
Example #3
    def __init__(
        self,
        guess: str,
        labels: Optional[List[str]],
        prefixes: Optional[List[str]] = None,
        shared: Dict[str, Any] = None,
    ) -> None:
        super().__init__(shared=shared)
        self.prefixes = prefixes if prefixes else []
        bleu = BleuMetric.compute(guess, labels)
        f1 = F1Metric.compute(guess, labels)
        self.add_with_prefixes("nlg_bleu", bleu)
        self.add_with_prefixes("nlg_f1", f1)

    def handle_message_helper(
        self, prefix_stripped_text: str
    ) -> Optional[Dict[str, Metric]]:
        here = [normalize_answer(x) for x in prefix_stripped_text.split(" ")]
        score = 1
        if len(self.turns) > 0:
            # BLEU (up to 3-grams) of the current turn against all previous turns
            score = nltkbleu.corpus_bleu(
                [self.turns],
                [here],
                smoothing_function=nltkbleu.SmoothingFunction(epsilon=1e-12).method1,
                weights=[1.0 / 3.0] * 3,
            )
        self.turns.append(here)
        return {self.metric_key(): BleuMetric(score)}
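The corpus_bleu call above scores the current turn against every previous turn in the conversation; note the extra list nesting (one hypothesis, whose references are all prior turns). A self-contained version of that call, with toy token lists, looks like this:

from nltk.translate import bleu_score as nltkbleu

turns = [['hello', 'there'], ['how', 'are', 'you']]  # earlier turns, tokenized
here = ['hello', 'how', 'are', 'you']                # current turn, tokenized

score = nltkbleu.corpus_bleu(
    [turns],   # one hypothesis -> one list of reference token lists
    [here],    # the single hypothesis
    smoothing_function=nltkbleu.SmoothingFunction(epsilon=1e-12).method1,
    weights=[1.0 / 3.0] * 3,
)
print(score)  # higher means the current turn overlaps earlier turns more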
Example #5
    def custom_evaluation(
        self,
        teacher_action: Message,
        labels: Optional[Tuple[str]],
        model_response: Message,
    ) -> None:
        """
        Various F1 metrics for the generated model response.
        """
        if not model_response.get('text'):
            # No response generated by model.
            return

        resp = model_response['text']
        # F1 metric over the *selected* knowledge.
        self.metrics.add(
            'knowledge_f1_docs',
            F1Metric.compute(resp, teacher_action[CONST.SELECTED_DOCS]),
        )
        self.metrics.add(
            'knowledge_f1_sentences',
            F1Metric.compute(resp, teacher_action[CONST.SELECTED_SENTENCES]),
        )

        # F1 Metrics over the *retrieved* docs.
        self.metrics.add(
            'f1_retrieved_docs',
            F1Metric.compute(resp, ' '.join(teacher_action[CONST.RETRIEVED_DOCS])),
        )
        self.metrics.add(
            'max_f1_retrieved_docs',
            F1Metric.compute(resp, teacher_action[CONST.RETRIEVED_DOCS]),
        )

        selected_doc_sentences = teacher_action[CONST.SELECTED_DOCS][0].split('\n')
        all_doc_sentences = []
        for doc in teacher_action[CONST.RETRIEVED_DOCS]:
            all_doc_sentences.extend(doc.split('\n'))

        self.metrics.add(
            'exact_copied_sentences',
            ExactMatchMetric.compute(resp, all_doc_sentences),
        )
        self.metrics.add(
            'max_substring_copied_sentences',
            CopiedSubstringMetric.compute(resp, all_doc_sentences),
        )
        self.metrics.add(
            'max_substring_copied_docs',
            CopiedSubstringMetric.compute(resp, teacher_action[CONST.RETRIEVED_DOCS]),
        )
        self.metrics.add(
            'substring_copied_docs',
            CopiedSubstringMetric.compute(
                resp, [''.join(teacher_action[CONST.RETRIEVED_DOCS])]
            ),
        )
        self.metrics.add(
            'max_f1_selected_docs_sentences',
            F1Metric.compute(resp, selected_doc_sentences),
        )
        self.metrics.add(
            'max_f1_docs_sentences',
            F1Metric.compute(resp, all_doc_sentences),
        )

        # N-gram matching metrics
        for k in range(1, 5):  # 1..4
            self.metrics.add(
                f'max_bleu_selected_docs_sentences-{k}',
                BleuMetric.compute(resp, selected_doc_sentences, k),
            )

        r1, r2, rL = RougeMetric.compute_many(resp, selected_doc_sentences)
        self.metrics.add('max_rouge_selected_docs_sentences_1', r1)
        self.metrics.add('max_rouge_selected_docs_sentences_2', r2)
        self.metrics.add('max_rouge_selected_docs_sentences_L', rL)
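F1Metric.compute in the snippets above is a SQuAD-style unigram F1 taken as the maximum over the candidate texts (ParlAI also applies its normalize_answer before tokenizing). A minimal sketch of that computation, using plain lowercasing in place of ParlAI's normalization and made-up documents, is:

from collections import Counter

def unigram_f1(guess, answer):
    # bag-of-words precision/recall/F1; ParlAI normalizes more aggressively
    g, a = guess.lower().split(), answer.lower().split()
    num_same = sum((Counter(g) & Counter(a)).values())
    if num_same == 0:
        return 0.0
    precision, recall = num_same / len(g), num_same / len(a)
    return 2 * precision * recall / (precision + recall)

# 'max_f1_retrieved_docs' style scoring: best F1 over a list of candidate texts
docs = ['the eiffel tower is in paris', 'the louvre is a museum in paris']
print(max(unigram_f1('the eiffel tower is located in paris', d) for d in docs))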
Example #6
    def custom_evaluation(
        self,
        teacher_action: Message,
        labels: Optional[Tuple[str]],
        model_response: Message,
    ):
        if 'metrics' in model_response and 'type' in teacher_action:
            # keep copies of metrics across both api calls/responses
            prefix = teacher_action['type']
            keys = list(model_response['metrics'].keys())
            for k in keys:
                self.metrics.add(f'{prefix}_{k}', model_response['metrics'][k])

        if 'text' not in model_response or not labels or 'type' not in teacher_action:
            return

        domain = teacher_action['domain']

        if teacher_action['type'] == 'apicall':
            # also count slot accuracy
            text = model_response['text']
            # use a set to prevent cheating via repeated guesses
            slot_guesses = set(text.replace(CALL_TOKEN + " ", "").split(' ; '))
            correct = 0
            for slot_guess in slot_guesses:
                if ' = ' not in slot_guess:
                    continue
                try:
                    slot, guess = slot_guess.split(' = ')
                except ValueError:
                    # more than one ' = ' separator; skip the malformed guess
                    continue
                if teacher_action['slots'].get(slot) == guess:
                    self.metrics.add('slot_p', AverageMetric(1))
                    self.metrics.add(f'{domain}_slot_p', AverageMetric(1))
                    correct += 1
                else:
                    self.metrics.add('slot_p', AverageMetric(0))
                    self.metrics.add(f'{domain}_slot_p', AverageMetric(0))
                    logging.debug(
                        f"Bad slot guess '{slot_guess}' != {teacher_action['slots']}"
                    )
            if teacher_action['slots']:
                self.metrics.add(
                    'slot_r',
                    AverageMetric(correct, len(teacher_action['slots'])))
                self.metrics.add(
                    f'{domain}_slot_r',
                    AverageMetric(correct, len(teacher_action['slots'])),
                )
                # joint goal accuracy: all gold slots guessed correctly
                self.metrics.add(
                    'jga', AverageMetric(correct == len(teacher_action['slots']))
                )

        elif teacher_action['type'] == 'apiresp':
            # keep track of statistics by domain
            f1_metric = F1Metric.compute(model_response['text'], labels)
            bleu_metric = BleuMetric.compute(model_response['text'], labels)
            self.metrics.add(f'{domain}_lex_f1', f1_metric)
            self.metrics.add(f'{domain}_lex_bleu', bleu_metric)

            delex_text = model_response['text']
            delex_label = labels[0]
            # compute delexicalized string metrics
            for slot, value in teacher_action['slots'].items():
                delex_text = delex_text.replace(value, slot)
                delex_label = delex_label.replace(value, slot)
            f1_metric = F1Metric.compute(delex_text, (delex_label,))
            self.metrics.add('delex_f1', f1_metric)
            self.metrics.add(f'{domain}_delex_f1', f1_metric)
            bleu_metric = BleuMetric.compute(delex_text, [delex_label])
            self.metrics.add('delex_bleu', bleu_metric)
            self.metrics.add(f'{domain}_delex_bleu', bleu_metric)
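The delexicalization step in the 'apiresp' branch simply replaces each slot value with its slot name in both the model text and the label before computing F1/BLEU, so the metric rewards producing the right template rather than copying exact values. A toy version with invented slots:

slots = {'restaurant_name': "Luigi's", 'time': '7pm'}
model_text = "I booked Luigi's for 7pm."
label = "Your table at Luigi's is reserved for 7pm."

delex_text, delex_label = model_text, label
for slot, value in slots.items():
    delex_text = delex_text.replace(value, slot)
    delex_label = delex_label.replace(value, slot)

print(delex_text)   # I booked restaurant_name for time.
print(delex_label)  # Your table at restaurant_name is reserved for time.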