Example #1
0
    def custom_evaluation(
        self,
        teacher_action: Message,
        labels: Optional[Tuple[str]],
        model_response: Message,
    ) -> None:
        if ((teacher_action[CONST.SELECTED_SENTENCES][0]
             == CONST.NO_SELECTED_SENTENCES_TOKEN)
                or (model_response.is_padding())
                or ('text' not in model_response)):
            # Has NOT selected knowledge, or is a batch-padding message
            return

        resp = model_response['text']
        self.metrics.add(
            'knowledge_f1_docs',
            F1Metric.compute(resp,
                             [' '.join(teacher_action[CONST.SELECTED_DOCS])]),
        )
        self.metrics.add(
            'knowledge_f1_max_docs',
            F1Metric.compute(resp, teacher_action[CONST.SELECTED_DOCS]),
        )
        self.metrics.add(
            'knowledge_f1_sentences',
            F1Metric.compute(
                resp, [' '.join(teacher_action[CONST.SELECTED_SENTENCES])]),
        )
        self.metrics.add(
            'knowledge_f1_max_sentences',
            F1Metric.compute(resp, teacher_action[CONST.SELECTED_SENTENCES]),
        )
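The two doc-level metrics above differ only in how the answer set is built: 'knowledge_f1_docs' joins all selected docs into one reference string, while 'knowledge_f1_max_docs' takes the best score over the individual docs. A minimal sketch of that distinction, assuming F1Metric.compute behaves like a whitespace-tokenized unigram F1 maximized over its answers (an illustration, not the ParlAI implementation):

# Illustrative sketch only: a stand-in for unigram-F1 "max over answers".
from collections import Counter
from typing import List


def unigram_f1(guess: str, answer: str) -> float:
    """Token-overlap F1 between two whitespace-tokenized strings."""
    g, a = guess.lower().split(), answer.lower().split()
    overlap = sum((Counter(g) & Counter(a)).values())
    if overlap == 0:
        return 0.0
    precision, recall = overlap / len(g), overlap / len(a)
    return 2 * precision * recall / (precision + recall)


def max_unigram_f1(guess: str, answers: List[str]) -> float:
    """Best F1 over a set of reference answers (how the `*_max_*` metrics score)."""
    return max(unigram_f1(guess, a) for a in answers)


selected_docs = ['the cat sat on the mat', 'dogs are loyal pets']
response = 'the cat sat quietly'
print(unigram_f1(response, ' '.join(selected_docs)))   # single joined reference
print(max_unigram_f1(response, selected_docs))         # best per-doc reference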
Example #2
0
 def compute(self, guess: str, answers: Iterable[str]) -> F1Metric:
     if guess is None or answers is None:
         return F1Metric(0, 0)
     guess = RareWordF1Calculator._filter(self._freq_dist, self._cutoff_count, guess)
     answers = [
         RareWordF1Calculator._filter(self._freq_dist, self._cutoff_count, a)
         for a in answers
     ]
     if not any(len(a) for a in answers):
         # no rare words in labels, set denominator to zero
         return F1Metric(0, 0)
     return F1Metric.compute(guess, answers)
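A hedged sketch of the filtering step, assuming `_filter` simply drops tokens whose corpus frequency reaches the cutoff so that F1 is then computed over rare words only (the real RareWordF1Calculator may tokenize and normalize differently):

# Hypothetical stand-in for `_filter`: keep only low-frequency tokens.
from collections import Counter


def filter_to_rare_words(freq_dist: Counter, cutoff_count: int, text: str) -> str:
    """Keep only tokens that occur fewer than `cutoff_count` times in the corpus."""
    return ' '.join(t for t in text.lower().split() if freq_dist.get(t, 0) < cutoff_count)


corpus_counts = Counter('the cat and the dog and the capybara'.split())
print(filter_to_rare_words(corpus_counts, cutoff_count=2, text='the friendly capybara'))
# -> 'friendly capybara' under these toy counts; F1 is then computed on what is left.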
Example #3
0
    def custom_evaluation(
        self,
        teacher_action: Message,
        labels: Optional[Tuple[str]],
        model_response: Message,
    ) -> None:
        if model_response.is_padding() or not model_response.get('text'):
            return

        expected_graph = break_knowledge_graph(labels[0].lower())
        predicted_graph = break_knowledge_graph(model_response['text'].lower())

        # Encode the graph edges/mutation operations as ints so F1Metric can readily consume them
        expected_graph_enc, predicted_graph_enc = encode_set_elements(
            expected_graph, predicted_graph)
        self.metrics.add(
            'response_elements_f1',
            F1Metric.compute(
                guess=' '.join(predicted_graph_enc),
                answers=[' '.join(expected_graph_enc)],
            ),
        )

        # Subject, Relation F1
        # Changing "(MUT) < you , in , house >"   --into-->   "(MUT) < you , in "
        # This is to check F1 for the predicted subject and relation overlap.
        ekg_sub_rel = set([e.rsplit(',', 1)[0] for e in expected_graph])
        pkg_sub_rel = set([e.rsplit(',', 1)[0] for e in predicted_graph])
        ekg_sub_rel_ids, pkg_sub_rel_ids = encode_set_elements(
            ekg_sub_rel, pkg_sub_rel)
        self.metrics.add(
            'graph_subject_relation_f1',
            F1Metric.compute(guess=' '.join(pkg_sub_rel_ids),
                             answers=[' '.join(ekg_sub_rel_ids)]),
        )

        # Subject F1
        # Changing "(MUT) < you , in " (produced above)   --into-->   "(MUT) < you "
        # This is to check F1 for the predicted subject overlap.
        ekg_sub = set([e.split(',')[0] for e in ekg_sub_rel])
        pkg_sub = set([e.split(',')[0] for e in pkg_sub_rel])
        ekg_sub_ids, pkg_sub_ids = encode_set_elements(ekg_sub, pkg_sub)
        self.metrics.add(
            'graph_subject_f1',
            F1Metric.compute(guess=' '.join(pkg_sub_ids),
                             answers=[' '.join(ekg_sub_ids)]),
        )
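For context, one plausible shape for `encode_set_elements` (an assumption about the helper, not its actual code): give matching elements of the two sets identical integer tokens so that plain unigram F1 over the encoded strings measures set overlap.

# Hypothetical sketch of encode_set_elements.
from typing import List, Set, Tuple


def encode_set_elements(expected: Set[str], predicted: Set[str]) -> Tuple[List[str], List[str]]:
    """Assign each distinct element a shared id and return both encoded sets."""
    vocab = {elem: str(idx) for idx, elem in enumerate(sorted(expected | predicted))}
    return [vocab[e] for e in expected], [vocab[p] for p in predicted]


exp = {'(mut) < you , in , house >', '(mut) < key , in , chest >'}
pred = {'(mut) < you , in , house >', '(mut) < sword , on , table >'}
exp_enc, pred_enc = encode_set_elements(exp, pred)
# ' '.join(...) over these id lists is what feeds F1Metric.compute above.
print(' '.join(pred_enc), '|', ' '.join(exp_enc))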
Example #4
0
 def __init__(
     self,
     guess: str,
     labels: Optional[List[str]],
     prefixes: Optional[List[str]] = None,
     shared: Optional[Dict[str, Any]] = None,
 ) -> None:
     super().__init__(shared=shared)
     self.prefixes = prefixes if prefixes else []
     bleu = BleuMetric.compute(guess, labels)
     f1 = F1Metric.compute(guess, labels)
     self.add_with_prefixes("nlg_bleu", bleu)
     self.add_with_prefixes("nlg_f1", f1)
Example #5
0
 def custom_evaluation(
     self,
     teacher_action: Message,
     labels: Optional[Tuple[str]],
     model_response: Message,
 ):
     if 'text' in model_response and 'checked_sentence' in teacher_action:
         self.metrics.add(
             'knowledge_f1',
             F1Metric.compute(model_response['text'],
                              [teacher_action['checked_sentence']]),
         )
     if 'text' in model_response and labels:
         self.metrics.add(
             'rare_word_f1',
             self.rare_word_f1.compute(model_response['text'], labels),
         )
Example #6
0
    def passes_filters(
        self, xturn: Dict[str, Any], yturn: Dict[str, Any]
    ) -> bool:
        """
        Subject the example to various filters.

        Return whether the example passes all filters.

        :param xturn:
            context turn
        :param yturn:
            target/knowledge turn

        :return passes_filters:
            return whether the example passes the filters.
        """
        passes = True
        # Example filters
        knowledge = (
            yturn['knowledge']
            .replace(TOKEN_KNOWLEDGE, '')
            .replace(TOKEN_END_KNOWLEDGE, '')
            .strip()
        )
        if passes and self.opt['skip_empty_context']:
            doc_context_sentences = [
                s for s in xturn['text'].split(TOKEN_KNOWLEDGE)[0].split('\n')
                if s
            ]
            passes &= (len(doc_context_sentences)
                       ) > 1  # All docs have <doc> token as their first line
        if passes and self.opt['min_knowledge_length'] > 0:
            passes &= len(
                knowledge.split(' ')) >= self.opt['min_knowledge_length']
        if passes and self.opt['min_knowledge_overlap'] > 0:
            assert 0 < self.opt['min_knowledge_overlap'] <= 1
            f1 = F1Metric.compute(yturn['text'].strip(), [knowledge])
            passes &= f1.value() >= self.opt['min_knowledge_overlap']
        if passes and self.opt['shared_knowledge_entity']:
            knol_ent = extract_entities(knowledge)
            if len(knol_ent) == 0:
                passes &= False
            label_ent = extract_entities(yturn.get('text'))
            ents = set(knol_ent).intersection(label_ent)
            if len(ents) == 0:
                passes &= False
        return passes
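A hedged usage sketch of the `min_knowledge_overlap` filter above, with toy turns, stand-in values for the knowledge tokens, and a small unigram F1 in place of F1Metric:

# Toy illustration of the knowledge-overlap filter (token constants are stand-ins).
from collections import Counter

TOKEN_KNOWLEDGE, TOKEN_END_KNOWLEDGE = '__knowledge__', '__endknowledge__'


def unigram_f1(guess: str, answer: str) -> float:
    g, a = guess.lower().split(), answer.lower().split()
    overlap = sum((Counter(g) & Counter(a)).values())
    return 0.0 if overlap == 0 else 2 * overlap / (len(g) + len(a))


yturn = {
    'text': 'the tower is 324 metres tall .',
    'knowledge': f'{TOKEN_KNOWLEDGE} the eiffel tower is 324 metres tall {TOKEN_END_KNOWLEDGE}',
}
opt = {'min_knowledge_overlap': 0.25}

knowledge = (
    yturn['knowledge'].replace(TOKEN_KNOWLEDGE, '').replace(TOKEN_END_KNOWLEDGE, '').strip()
)
passes = unigram_f1(yturn['text'].strip(), knowledge) >= opt['min_knowledge_overlap']
print(passes)  # True for this toy pair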
Example #7
0
    def get_best_doc(
        self, all_docs: List[Document], labels: List[str]
    ) -> Tuple[Optional[float], Optional[Document], Optional[int]]:
        """
        Given a set of all retrieved docs, determine best fitting document.

        :param all_docs:
            list of all retrieved Documents
        :param labels:
            labels for the current example

        :return (best_f1, best_doc, best_doc_idx):
            return the best document, along with the f1 overlap and index into all_docs
        """
        docs = []
        for i, d in enumerate(all_docs):
            if d.startswith('.'):
                d = d[2:]
            try:
                docs += [(i, s) for s in nltk.sent_tokenize(d)]
            except IndexError:
                # Something's up with the NLTK Sentence tokenizer here.
                docs += [(i, s) for s in d.split('.')]
        f1s, inds = torch.FloatTensor(
            [F1Metric.compute(labels[0], [d]).value() for _, d in docs]
        ).topk(len(docs))
        best_doc = None
        best_doc_idx = None
        best_f1 = None
        for f1, ind in zip(f1s, inds):
            if self.threshold < f1 < 1.0 and labels[0] not in docs[ind][1]:
                best_doc = docs[ind][1]
                best_doc_idx = docs[ind][0]
                best_f1 = f1.item()
                break

        return best_f1, best_doc, best_doc_idx
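The same selection idea, sketched without torch or nltk to show the intent (an illustration, not the original helper): rank candidate sentences by F1 against the label and keep the best partial, non-exact match.

# Simplified stand-in for get_best_doc's ranking loop.
from collections import Counter
from typing import List, Optional, Tuple


def unigram_f1(guess: str, answer: str) -> float:
    g, a = guess.lower().split(), answer.lower().split()
    overlap = sum((Counter(g) & Counter(a)).values())
    return 0.0 if overlap == 0 else 2 * overlap / (len(g) + len(a))


def best_partial_match(
    docs: List[str], label: str, threshold: float = 0.0
) -> Tuple[Optional[float], Optional[str], Optional[int]]:
    sentences = [(i, s.strip()) for i, d in enumerate(docs) for s in d.split('.') if s.strip()]
    ranked = sorted(sentences, key=lambda pair: unigram_f1(label, pair[1]), reverse=True)
    for doc_idx, sent in ranked:
        f1 = unigram_f1(label, sent)
        if threshold < f1 < 1.0 and label not in sent:
            return f1, sent, doc_idx
    return None, None, None


docs = ['penguins are flightless birds. they live in the southern hemisphere.']
print(best_partial_match(docs, 'penguins are birds that cannot fly'))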
Example #8
0
    def _record_retrieval_metrics(self, batch: Batch,
                                  encoder_state: Tuple[Any, ...]):
        """
        Compute retrieval metrics, given retrieved documents.

        Only works when `--debug` is set.

        If there is knowledge in the Batch, we compute the following metrics:
        A) Doc Level:
        1. recall @ 1 --> is the correct document the first document?
        2. recall @ N --> is the correct document in the first N docs?

        B) Passage Level:
        1. recall @ 1 --> is the correct passage in the first document?
        2. recall @ N --> is the correct passage in the first N docs?

        :param batch:
            training/eval batch
        :param encoder_state:
            encoder states from RagEncoder
        """
        if batch.valid_indices is None or batch.observations is None:
            return
        docs: List[List[Document]] = []
        _, _, input_turns_cnt, docs, _ = encoder_state
        if input_turns_cnt is not None:
            new_docs = []
            offset = 0
            for it in input_turns_cnt:
                docs_it = [dd for d in docs[offset:offset + it] for dd in d]
                new_docs.append(docs_it)
                offset += it
            docs = new_docs
        title_key = self.opt['gold_knowledge_title_key']
        passage_key = self.opt['gold_knowledge_passage_key']
        batchsize = len(batch.valid_indices)
        n_docs = self.opt['n_docs']
        metrics = {
            k: [0] * batchsize
            for k in [
                'doc_r@1',
                f'doc_r@{n_docs}',
                'passage_r@1',
                f'passage_r@{n_docs}',
                'title@1_f1',
                'passage@1_f1',
            ]
        }
        for i in range(batchsize):
            ex = batch.observations[i]
            label_title = normalize_answer(ex.get(title_key, ''))
            label_passage = normalize_answer(ex.get(passage_key, ''))

            for rank, doc in enumerate(docs[i]):
                model_title = normalize_answer(doc.get_title())
                model_passage = normalize_answer(doc.get_text())

                title_exact_match = model_title == label_title
                passage_match = (model_passage in label_passage
                                 or label_passage in model_passage)

                if rank == 0:
                    metrics['doc_r@1'][i] = int(title_exact_match)
                    metrics['passage_r@1'][i] = int(passage_match)
                    metrics['title@1_f1'][i] = F1Metric.compute(
                        guess=model_title, answers=[label_title]).value()
                    metrics['passage@1_f1'][i] = F1Metric.compute(
                        guess=model_passage, answers=[label_passage]).value()
                metrics[f'doc_r@{n_docs}'][i] = int(
                    metrics[f'doc_r@{n_docs}'][i] or title_exact_match)
                metrics[f'passage_r@{n_docs}'][i] = int(
                    metrics[f'passage_r@{n_docs}'][i] or passage_match)

        for m in metrics:
            self.record_local_metric(
                m, AverageMetric.many(metrics[m], [1] * batchsize))
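A hedged sketch of how the per-example recall values above aggregate, using plain Python in place of AverageMetric.many and record_local_metric:

# Toy recall@k aggregation over a batch of two examples.
from typing import List


def recall_at_k(ranked_titles: List[str], gold_title: str, k: int) -> int:
    """1 if the gold title appears among the top-k retrieved docs, else 0."""
    return int(gold_title in ranked_titles[:k])


retrieved = [['paris', 'london', 'rome'], ['tokyo', 'kyoto', 'osaka']]
gold = ['london', 'tokyo']

doc_r_at_1 = [recall_at_k(docs, g, 1) for docs, g in zip(retrieved, gold)]
doc_r_at_n = [recall_at_k(docs, g, 3) for docs, g in zip(retrieved, gold)]
# Reporting then averages the per-example values, as AverageMetric.many(values, [1] * batchsize) does.
print(sum(doc_r_at_1) / len(doc_r_at_1), sum(doc_r_at_n) / len(doc_r_at_n))  # 0.5 1.0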
Example #9
0
    def custom_evaluation(
        self,
        teacher_action: Message,
        labels: Optional[Tuple[str]],
        model_response: Message,
    ) -> None:
        """
        Various F1 metrics for the generated model response.
        """
        if not model_response.get('text'):
            # No response generated by model.
            return

        resp = model_response['text']
        # F1 metric over the *selected* knowledge.
        self.metrics.add(
            'knowledge_f1_docs',
            F1Metric.compute(resp, teacher_action[CONST.SELECTED_DOCS]),
        )
        self.metrics.add(
            'knowledge_f1_sentences',
            F1Metric.compute(resp, teacher_action[CONST.SELECTED_SENTENCES]),
        )

        # F1 Metrics over the *retrieved* docs.
        self.metrics.add(
            'f1_retrieved_docs',
            F1Metric.compute(resp,
                             [' '.join(teacher_action[CONST.RETRIEVED_DOCS])]),
        )
        self.metrics.add(
            'max_f1_retrieved_docs',
            F1Metric.compute(resp, teacher_action[CONST.RETRIEVED_DOCS]),
        )

        selected_doc_sentences = teacher_action[CONST.SELECTED_DOCS][0].split('\n')
        all_doc_sentences = []
        for doc in teacher_action[CONST.RETRIEVED_DOCS]:
            all_doc_sentences.extend(doc.split('\n'))

        self.metrics.add('exact_copied_sentences',
                         ExactMatchMetric.compute(resp, all_doc_sentences))
        self.metrics.add(
            'max_substring_copied_sentences',
            CopiedSubstringMetric.compute(resp, all_doc_sentences),
        )
        self.metrics.add(
            'max_substring_copied_docs',
            CopiedSubstringMetric.compute(
                resp, teacher_action[CONST.RETRIEVED_DOCS]),
        )
        self.metrics.add(
            'substring_copied_docs',
            CopiedSubstringMetric.compute(
                resp, [''.join(teacher_action[CONST.RETRIEVED_DOCS])]),
        )
        self.metrics.add(
            'max_f1_selected_docs_sentences',
            F1Metric.compute(resp, selected_doc_sentences),
        )
        self.metrics.add('max_f1_docs_sentences',
                         F1Metric.compute(resp, all_doc_sentences))

        # N-gram matching metrics
        for k in range(1, 5):  # 1..4
            self.metrics.add(
                f'max_bleu_selected_docs_sentences-{k}',
                BleuMetric.compute(resp, selected_doc_sentences, k),
            )

        r1, r2, rL = RougeMetric.compute_many(resp, selected_doc_sentences)
        self.metrics.add('max_rouge_selected_docs_sentences_1', r1)
        self.metrics.add('max_rouge_selected_docs_sentences_2', r2)
        self.metrics.add('max_rouge_selected_docs_sentences_L', rL)
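For the exact-copy check, a stand-in sketch assuming ExactMatchMetric compares a normalized response against each candidate sentence (the actual normalization rules may differ):

# Hypothetical illustration of sentence-level exact-copy detection.
import re
import string
from typing import List


def normalize(text: str) -> str:
    """Lowercase, strip punctuation and articles, collapse whitespace."""
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\b(a|an|the)\b', ' ', text)
    return ' '.join(text.split())


def copied_any_sentence(response: str, doc_sentences: List[str]) -> int:
    return int(any(normalize(response) == normalize(s) for s in doc_sentences))


doc_sentences = ['The Nile is the longest river in Africa.', 'It flows north.']
print(copied_any_sentence('the nile is the longest river in africa', doc_sentences))  # 1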
Example #10
0
    def custom_evaluation(
        self,
        teacher_action: Message,
        labels: Optional[Tuple[str]],
        model_response: Message,
    ):
        """
        Custom Evaluations for Wizard of Wikipedia.

        When the label is `chosen_sent`, evaluate whether the model response...
        1) Is the correct document (title)
        2) _contains_ the correct chosen sentence (even if it's not wholly the answer)

        When the label is `response`, we compute F1 of model generation w.r.t checked sentence.

        :param teacher_action:
            The message last sent from this teacher.
        :param labels:
            The previous correct labels, if there were any.
        :param model_response:
            The raw response from the model. Generally you want to rely on the
            text field, but others may be necessary in specific situations.
        """
        if (self.label_type == 'response' and 'text' in model_response
                and 'checked_sentence' in teacher_action):
            self.metrics.add(
                'knowledge_f1',
                F1Metric.compute(model_response['text'],
                                 [teacher_action['checked_sentence']]),
            )
            if labels:
                self.metrics.add(
                    'rare_word_f1',
                    self.rare_word_f1.compute(model_response['text'], labels),
                )
        elif (self.label_type == 'chosen_sent'
              and TOKEN_KNOWLEDGE in model_response['text']):
            try:
                correct_title, correct_passage = [
                    normalize_answer(a)
                    for a in labels[0].split(TOKEN_KNOWLEDGE)
                ]
            except ValueError:
                # Knowledge not chosen
                correct_title, correct_passage = TOKEN_NOCHOSEN, TOKEN_NOCHOSEN
            title, passage = [
                normalize_answer(a)
                for a in model_response['text'].split(TOKEN_KNOWLEDGE)
            ]

            self.metrics.add('title_r@1',
                             AverageMetric(int(correct_title == title)))
            self.metrics.add('passage_r@1',
                             AverageMetric(int(correct_passage in passage)))
            if 'title_candidates' in model_response:
                title_candidates = [
                    normalize_answer(t)
                    for t in model_response['title_candidates']
                ][:5]
                self.metrics.add(
                    'title_r@5',
                    AverageMetric(
                        int(any(correct_title == t
                                for t in title_candidates))),
                )
            if 'text_candidates' in model_response:
                text_candidates = [
                    normalize_answer(t)
                    for t in model_response['text_candidates']
                ][:5]
                self.metrics.add(
                    'passage_r@5',
                    AverageMetric(
                        int(any(correct_passage in t
                                for t in text_candidates))),
                )
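A hedged sketch of the `chosen_sent` scoring path above, using a stand-in value for TOKEN_KNOWLEDGE and a simplified normalizer: split prediction and label on the knowledge token, then check exact title match and passage containment.

# Toy walk-through of title_r@1 / passage_r@1 (TOKEN_KNOWLEDGE value is a stand-in).
TOKEN_KNOWLEDGE = '__knowledge__'


def normalize(text: str) -> str:
    return ' '.join(text.lower().split())


label = f'Gardening{TOKEN_KNOWLEDGE}Gardening is a relaxing activity.'
prediction = f'Gardening{TOKEN_KNOWLEDGE}many people find gardening is a relaxing activity.'

correct_title, correct_passage = [normalize(a) for a in label.split(TOKEN_KNOWLEDGE)]
title, passage = [normalize(a) for a in prediction.split(TOKEN_KNOWLEDGE)]

title_r_at_1 = int(correct_title == title)        # exact title match
passage_r_at_1 = int(correct_passage in passage)  # passage containment
print(title_r_at_1, passage_r_at_1)  # 1 1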
Example #11
0
    def test_custom_eval(self):
        """
        Test whether custom evaluation works.
        """
        with testing_utils.capture_output():
            parser = setup_args()
            opt = parser.parse_args([
                '--task',
                'wizard_of_wikipedia',
                '--datatype',
                'valid',
                '--label-type',
                'chosen_sent',
            ])
            teacher = create_task_agent_from_taskname(opt)[0]

        title = 'Gardening'
        cands = list('four')

        text = "Gardening\nI like Gardening, even when I've only been doing it for a short time."
        response = 'I live on a farm, we garden all year long, it is very relaxing.'
        checked_sent = (
            'Gardening is considered by many people to be a relaxing activity.'
        )
        checked_sent_label = f'{title}{TOKEN_KNOWLEDGE}{checked_sent}'

        retrieval_metric_keys = [
            'passage_r@1', 'passage_r@5', 'title_r@1', 'title_r@5'
        ]

        chosen_sent_teacher_action = Message({
            'text': text,
            'labels': [checked_sent_label],
            'title': [title],
            'checked_sentence': [checked_sent],
        })
        correct_chosen_sent_response = Message({
            'text': checked_sent_label,
            'title_candidates': [title] + cands,
            'text_candidates': [checked_sent_label] + cands,
        })
        top5_chosen_sent_response = Message({
            'text': f'hello{TOKEN_KNOWLEDGE}goodbye',
            'title_candidates': cands + [title],
            'text_candidates': cands + [checked_sent_label],
        })
        incorrect_chosen_sent_response = Message({
            'text': f'hello{TOKEN_KNOWLEDGE}goodbye',
            'title_candidates': cands,
            'text_candidates': cands,
        })

        response_teacher_action = Message({
            'text': text,
            'labels': [response],
            'checked_sentence': checked_sent
        })
        high_f1_response = Message({'text': checked_sent})
        low_f1_response = Message({'text': 'incorrect'})

        # 1) Test with correct top sentence
        teacher.reset_metrics()
        teacher.custom_evaluation(
            chosen_sent_teacher_action,
            [checked_sent_label],
            correct_chosen_sent_response,
        )
        report = teacher.report()
        for k in retrieval_metric_keys:
            assert k in report
            assert report[k] == AverageMetric(1)

        # 2) Test with top sentence in top 5
        teacher.reset_metrics()
        teacher.custom_evaluation(chosen_sent_teacher_action,
                                  [checked_sent_label],
                                  top5_chosen_sent_response)
        report = teacher.report()
        for k in retrieval_metric_keys:
            assert k in report
            expected = AverageMetric(1) if '5' in k else AverageMetric(0)
            assert report[k] == expected

        # 3) Test with no top sentences
        teacher.reset_metrics()
        teacher.custom_evaluation(
            chosen_sent_teacher_action,
            [checked_sent_label],
            incorrect_chosen_sent_response,
        )
        report = teacher.report()
        for k in retrieval_metric_keys:
            assert k in report
            assert report[k] == AverageMetric(0)

        # 4) Test knowledge f1 with high f1
        teacher.label_type = 'response'
        teacher.reset_metrics()
        teacher.custom_evaluation(response_teacher_action, [response],
                                  high_f1_response)
        report = teacher.report()
        assert 'knowledge_f1' in report
        assert report['knowledge_f1'] == F1Metric(1)

        # 5) Test knowledge f1 with low f1
        teacher.reset_metrics()
        teacher.custom_evaluation(response_teacher_action, [response],
                                  low_f1_response)
        report = teacher.report()
        assert 'knowledge_f1' in report
        assert report['knowledge_f1'] == F1Metric(0)
Example #12
0
    def custom_evaluation(
        self,
        teacher_action: Message,
        labels: Optional[Tuple[str]],
        model_response: Message,
    ):
        if 'metrics' in model_response and 'type' in teacher_action:
            # keep copies of metrics across both api calls/responses
            prefix = teacher_action['type']
            keys = list(model_response['metrics'].keys())
            for k in keys:
                self.metrics.add(f'{prefix}_{k}', model_response['metrics'][k])

        if 'text' not in model_response or not labels or 'type' not in teacher_action:
            return

        domain = teacher_action['domain']

        if teacher_action['type'] == 'apicall':
            # also count slot accuracy
            text = model_response['text']
            # Use a set to prevent cheating via repeated guesses.
            slot_guesses = set(text.replace(CALL_TOKEN + " ", "").split(' ; '))
            correct = 0
            for slot_guess in slot_guesses:
                if ' = ' not in slot_guess:
                    continue
                try:
                    slot, guess = slot_guess.split(' = ')
                except ValueError:
                    continue
                if teacher_action['slots'].get(slot) == guess:
                    self.metrics.add('slot_p', AverageMetric(1))
                    self.metrics.add(f'{domain}_slot_p', AverageMetric(1))
                    correct += 1
                else:
                    self.metrics.add('slot_p', AverageMetric(0))
                    self.metrics.add(f'{domain}_slot_p', AverageMetric(0))
                    logging.debug(
                        f"Bad slot guess '{slot_guess}' != {teacher_action['slots']}"
                    )
            if teacher_action['slots']:
                self.metrics.add(
                    'slot_r',
                    AverageMetric(correct, len(teacher_action['slots'])))
                self.metrics.add(
                    f'{domain}_slot_r',
                    AverageMetric(correct, len(teacher_action['slots'])),
                )
                self.metrics.add(
                    'jga',
                    AverageMetric(correct == len(teacher_action['slots'])))

        elif teacher_action['type'] == 'apiresp':
            # keep track of statistics by domain
            f1_metric = F1Metric.compute(model_response['text'], labels)
            bleu_metric = BleuMetric.compute(model_response['text'], labels)
            self.metrics.add(f'{domain}_lex_f1', f1_metric)
            self.metrics.add(f'{domain}_lex_bleu', bleu_metric)

            delex_text = model_response['text']
            delex_label = labels[0]
            # compute delexicalized string metrics
            for slot, value in teacher_action['slots'].items():
                delex_text = delex_text.replace(value, slot)
                delex_label = delex_label.replace(value, slot)
            f1_metric = F1Metric.compute(delex_text, (delex_label, ))
            self.metrics.add('delex_f1', f1_metric)
            self.metrics.add(f'{domain}_delex_f1', f1_metric)
            bleu_metric = BleuMetric.compute(delex_text, [delex_label])
            self.metrics.add('delex_bleu', bleu_metric)
            self.metrics.add(f'{domain}_delex_bleu', bleu_metric)
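A hedged sketch of the delexicalization step above: replace each gold slot value with its slot name in both the model text and the label, then score overlap on the templated forms (a small unigram F1 stands in for F1Metric here).

# Toy delexicalized-F1 illustration with made-up slots and strings.
from collections import Counter
from typing import Dict


def unigram_f1(guess: str, answer: str) -> float:
    g, a = guess.lower().split(), answer.lower().split()
    overlap = sum((Counter(g) & Counter(a)).values())
    return 0.0 if overlap == 0 else 2 * overlap / (len(g) + len(a))


def delexicalize(text: str, slots: Dict[str, str]) -> str:
    for slot, value in slots.items():
        text = text.replace(value, slot)
    return text


slots = {'restaurant_name': 'Casa Luna', 'food': 'tapas'}
model_text = 'Casa Luna serves great tapas downtown'
label = 'Try Casa Luna , they serve tapas'

lex_f1 = unigram_f1(model_text, label)
delex_f1 = unigram_f1(delexicalize(model_text, slots), delexicalize(label, slots))
# Both strings are now compared in templated form, e.g. 'restaurant_name serves great food ...'.
print(round(lex_f1, 2), round(delex_f1, 2))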