def custom_evaluation(
    self,
    teacher_action: Message,
    labels: Optional[Tuple[str]],
    model_response: Message,
) -> None:
    if (
        (
            teacher_action[CONST.SELECTED_SENTENCES][0]
            == CONST.NO_SELECTED_SENTENCES_TOKEN
        )
        or model_response.is_padding()
        or ('text' not in model_response)
    ):
        # Has NOT selected knowledge, or is a batch padding message
        return

    resp = model_response['text']
    self.metrics.add(
        'knowledge_f1_docs',
        F1Metric.compute(resp, [' '.join(teacher_action[CONST.SELECTED_DOCS])]),
    )
    self.metrics.add(
        'knowledge_f1_max_docs',
        F1Metric.compute(resp, teacher_action[CONST.SELECTED_DOCS]),
    )
    self.metrics.add(
        'knowledge_f1_sentences',
        F1Metric.compute(resp, [' '.join(teacher_action[CONST.SELECTED_SENTENCES])]),
    )
    self.metrics.add(
        'knowledge_f1_max_sentences',
        F1Metric.compute(resp, teacher_action[CONST.SELECTED_SENTENCES]),
    )

def compute(self, guess: str, answers: Iterable[str]) -> F1Metric:
    if guess is None or answers is None:
        return F1Metric(0, 0)
    guess = RareWordF1Calculator._filter(self._freq_dist, self._cutoff_count, guess)
    answers = [
        RareWordF1Calculator._filter(self._freq_dist, self._cutoff_count, a)
        for a in answers
    ]
    if not any(len(a) for a in answers):
        # no rare words in labels, set denominator to zero
        return F1Metric(0, 0)
    return F1Metric.compute(guess, answers)

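# For reference, a minimal sketch of the token-level F1 that F1Metric.compute is
# assumed to implement: unigram precision/recall of the guess against each
# answer, keeping the best-scoring answer. This is an illustrative
# approximation only; the library's implementation also normalizes the text
# (lowercasing, stripping punctuation and articles) before tokenizing.
from collections import Counter
from typing import Iterable, Tuple


def _unigram_f1_sketch(guess: str, answers: Iterable[str]) -> float:
    """Best token-level F1 of `guess` against any answer (illustrative sketch)."""

    def prf(g_tokens, a_tokens) -> Tuple[float, float, float]:
        common = Counter(g_tokens) & Counter(a_tokens)
        overlap = sum(common.values())
        if overlap == 0:
            return 0.0, 0.0, 0.0
        precision = overlap / len(g_tokens)
        recall = overlap / len(a_tokens)
        return precision, recall, 2 * precision * recall / (precision + recall)

    g = guess.lower().split()
    scores = [prf(g, a.lower().split())[2] for a in answers]
    return max(scores, default=0.0)
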
def custom_evaluation(
    self,
    teacher_action: Message,
    labels: Optional[Tuple[str]],
    model_response: Message,
) -> None:
    if model_response.is_padding() or (not model_response.get('text', None)):
        return

    expected_graph = break_knowledge_graph(labels[0].lower())
    predicted_graph = break_knowledge_graph(model_response['text'].lower())

    # Encode the graph edges/mutation operations as ints so F1Metric can use them directly
    expected_graph_enc, predicted_graph_enc = encode_set_elements(
        expected_graph, predicted_graph
    )
    self.metrics.add(
        'response_elements_f1',
        F1Metric.compute(
            guess=' '.join(predicted_graph_enc),
            answers=[' '.join(expected_graph_enc)],
        ),
    )

    # Subject, Relation F1
    # Changing "(MUT) < you , in , house >" --into--> "(MUT) < you , in "
    # This checks F1 for the overlap of the predicted subject and relation.
    ekg_sub_rel = set([e.rsplit(',', 1)[0] for e in expected_graph])
    pkg_sub_rel = set([e.rsplit(',', 1)[0] for e in predicted_graph])
    ekg_sub_rel_ids, pkg_sub_rel_ids = encode_set_elements(ekg_sub_rel, pkg_sub_rel)
    self.metrics.add(
        'graph_subject_relation_f1',
        F1Metric.compute(
            guess=' '.join(pkg_sub_rel_ids), answers=[' '.join(ekg_sub_rel_ids)]
        ),
    )

    # Subject F1
    # Changing "(MUT) < you , in " (produced above) --into--> "(MUT) < you "
    # This checks F1 for the overlap of the predicted subject.
    ekg_sub = set([e.split(',')[0] for e in ekg_sub_rel])
    pkg_sub = set([e.split(',')[0] for e in pkg_sub_rel])
    ekg_sub_ids, pkg_sub_ids = encode_set_elements(ekg_sub, pkg_sub)
    self.metrics.add(
        'graph_subject_f1',
        F1Metric.compute(
            guess=' '.join(pkg_sub_ids), answers=[' '.join(ekg_sub_ids)]
        ),
    )

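# encode_set_elements is defined elsewhere. A hedged sketch of the behaviour
# assumed above (not the project's actual implementation): assign the same
# integer id, returned as a string token, to identical elements across both
# sets, so that each multi-word graph edge behaves as a single "word" when the
# space-joined id sequences are scored with unigram F1Metric.
from typing import List, Set, Tuple


def encode_set_elements_sketch(
    expected: Set[str], predicted: Set[str]
) -> Tuple[List[str], List[str]]:
    vocab = {elem: str(idx) for idx, elem in enumerate(sorted(expected | predicted))}
    return [vocab[e] for e in expected], [vocab[p] for p in predicted]
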
def __init__(
    self,
    guess: str,
    labels: Optional[List[str]],
    prefixes: Optional[List[str]] = None,
    shared: Dict[str, Any] = None,
) -> None:
    super().__init__(shared=shared)
    self.prefixes = prefixes if prefixes else []
    bleu = BleuMetric.compute(guess, labels)
    f1 = F1Metric.compute(guess, labels)
    self.add_with_prefixes("nlg_bleu", bleu)
    self.add_with_prefixes("nlg_f1", f1)

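# add_with_prefixes is not shown here. As an assumption about its behaviour, it
# likely records the metric under its bare name and once more under each
# configured prefix. A self-contained, hypothetical sketch of that pattern
# (the helper name, store shape, and '/' separator are illustrative only):
from typing import Any, Dict, List


def add_with_prefixes_sketch(
    store: Dict[str, List[Any]], prefixes: List[str], name: str, value: Any
) -> None:
    # Record under the plain metric name...
    store.setdefault(name, []).append(value)
    # ...and once more under each prefixed variant (e.g. per API-call type).
    for prefix in prefixes:
        store.setdefault(f'{prefix}/{name}', []).append(value)
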
def custom_evaluation(
    self,
    teacher_action: Message,
    labels: Optional[Tuple[str]],
    model_response: Message,
):
    if 'text' in model_response and 'checked_sentence' in teacher_action:
        self.metrics.add(
            'knowledge_f1',
            F1Metric.compute(
                model_response['text'], [teacher_action['checked_sentence']]
            ),
        )
    if 'text' in model_response and labels:
        self.metrics.add(
            'rare_word_f1',
            self.rare_word_f1.compute(model_response['text'], labels),
        )

def passes_filters(self, xturn: Dict[str, Any], yturn: Dict[str, Any]) -> bool:
    """
    Subject the example to various filters.

    Return whether the example passes all filters.

    :param xturn: context turn
    :param yturn: target/knowledge turn

    :return passes_filters: return whether the example passes the filters.
    """
    passes = True

    # Example filters
    knowledge = (
        yturn['knowledge']
        .replace(TOKEN_KNOWLEDGE, '')
        .replace(TOKEN_END_KNOWLEDGE, '')
        .strip()
    )
    if passes and self.opt['skip_empty_context']:
        doc_context_sentences = [
            s for s in xturn['text'].split(TOKEN_KNOWLEDGE)[0].split('\n') if s
        ]
        # All docs have the <doc> token as their first line
        passes &= len(doc_context_sentences) > 1
    if passes and self.opt['min_knowledge_length'] > 0:
        passes &= len(knowledge.split(' ')) >= self.opt['min_knowledge_length']
    if passes and self.opt['min_knowledge_overlap'] > 0:
        assert 0 < self.opt['min_knowledge_overlap'] <= 1
        f1 = F1Metric.compute(yturn['text'].strip(), [knowledge])
        passes &= f1.value() >= self.opt['min_knowledge_overlap']
    if passes and self.opt['shared_knowledge_entity']:
        knol_ent = extract_entities(knowledge)
        if len(knol_ent) == 0:
            passes &= False
        label_ent = extract_entities(yturn.get('text'))
        ents = set(knol_ent).intersection(label_ent)
        if len(ents) == 0:
            passes &= False
    return passes

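# The filters above read four opt keys. A hypothetical configuration for
# illustration only (these values are assumptions, not project defaults):
EXAMPLE_FILTER_OPT = {
    'skip_empty_context': True,       # require context lines beyond the leading <doc> token line
    'min_knowledge_length': 10,       # knowledge must contain >= 10 whitespace tokens
    'min_knowledge_overlap': 0.25,    # unigram F1(target turn, knowledge) must reach 0.25
    'shared_knowledge_entity': True,  # target and knowledge must share at least one entity
}
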
def get_best_doc(
    self, all_docs: List[Document], labels: List[str]
) -> Tuple[Optional[float], Optional[Document], Optional[int]]:
    """
    Given a set of all retrieved docs, determine the best fitting document.

    :param all_docs: list of all retrieved Documents
    :param labels: labels for the current example

    :return (best_f1, best_doc, best_doc_idx):
        return the best document, along with the F1 overlap and the index into all_docs
    """
    docs = []
    for i, d in enumerate(all_docs):
        if d.startswith('.'):
            d = d[2:]
        try:
            docs += [(i, s) for s in nltk.sent_tokenize(d)]
        except IndexError:
            # Something's up with the NLTK sentence tokenizer here.
            docs += [(i, s) for s in d.split('.')]

    f1s, inds = torch.FloatTensor(
        [F1Metric.compute(labels[0], [d]).value() for _, d in docs]
    ).topk(len(docs))
    best_doc = None
    best_doc_idx = None
    best_f1 = None
    for f1, ind in zip(f1s, inds):
        if self.threshold < f1 < 1.0 and labels[0] not in docs[ind][1]:
            best_doc = docs[ind][1]
            best_doc_idx = docs[ind][0]
            best_f1 = f1.item()
            break

    return best_f1, best_doc, best_doc_idx

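# A self-contained illustration of the selection rule above, assuming F1Metric
# from this codebase is in scope: keep the highest-F1 sentence whose overlap
# with the label is above a threshold but below 1.0, so verbatim copies of the
# label are never chosen as the supporting document. The helper name, threshold
# value, and inputs are hypothetical.
def pick_supporting_sentence(label: str, sentences, threshold: float = 0.3):
    scored = sorted(
        ((F1Metric.compute(label, [s]).value(), s) for s in sentences),
        reverse=True,
    )
    for f1, sent in scored:
        if threshold < f1 < 1.0 and label not in sent:
            return f1, sent
    return None, None
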
def _record_retrieval_metrics(self, batch: Batch, encoder_state: Tuple[Any, ...]):
    """
    Compute retrieval metrics, given retrieved documents.

    Only works when `--debug` is set.

    If there is knowledge in the Batch, we compute the following metrics:

    A) Doc Level:
        1. recall @ 1 --> is the correct document the first document?
        2. recall @ N --> is the correct document in the first N docs?

    B) Passage Level:
        1. recall @ 1 --> is the correct passage in the first document?
        2. recall @ N --> is the correct passage in the first N docs?

    :param batch: training/eval batch
    :param encoder_state: encoder states from RagEncoder
    """
    if batch.valid_indices is None or batch.observations is None:
        return
    docs: List[List[Document]] = []
    _, _, input_turns_cnt, docs, _ = encoder_state
    if input_turns_cnt is not None:
        new_docs = []
        offset = 0
        for it in input_turns_cnt:
            docs_it = [dd for d in docs[offset : offset + it] for dd in d]
            new_docs.append(docs_it)
            offset += it
        docs = new_docs

    title_key = self.opt['gold_knowledge_title_key']
    passage_key = self.opt['gold_knowledge_passage_key']
    batchsize = len(batch.valid_indices)
    n_docs = self.opt['n_docs']
    metrics = {
        k: [0] * batchsize
        for k in [
            'doc_r@1',
            f'doc_r@{n_docs}',
            'passage_r@1',
            f'passage_r@{n_docs}',
            'title@1_f1',
            'passage@1_f1',
        ]
    }

    for i in range(batchsize):
        ex = batch.observations[i]
        label_title = normalize_answer(ex.get(title_key, ''))
        label_passage = normalize_answer(ex.get(passage_key, ''))
        for rank, doc in enumerate(docs[i]):
            model_title = normalize_answer(doc.get_title())
            model_passage = normalize_answer(doc.get_text())
            title_exact_match = model_title == label_title
            passage_match = (
                model_passage in label_passage or label_passage in model_passage
            )
            if rank == 0:
                metrics['doc_r@1'][i] = int(title_exact_match)
                metrics['passage_r@1'][i] = int(passage_match)
                metrics['title@1_f1'][i] = F1Metric.compute(
                    guess=model_title, answers=[label_title]
                ).value()
                metrics['passage@1_f1'][i] = F1Metric.compute(
                    guess=model_passage, answers=[label_passage]
                ).value()
            metrics[f'doc_r@{n_docs}'][i] = int(
                metrics[f'doc_r@{n_docs}'][i] or title_exact_match
            )
            metrics[f'passage_r@{n_docs}'][i] = int(
                metrics[f'passage_r@{n_docs}'][i] or passage_match
            )

    for m in metrics:
        self.record_local_metric(m, AverageMetric.many(metrics[m], [1] * batchsize))

def custom_evaluation(
    self,
    teacher_action: Message,
    labels: Optional[Tuple[str]],
    model_response: Message,
) -> None:
    """
    Various F1 metrics for the generated model response.
    """
    if not model_response.get('text'):
        # No response generated by the model.
        return

    resp = model_response['text']

    # F1 metrics over the *selected* knowledge.
    self.metrics.add(
        'knowledge_f1_docs',
        F1Metric.compute(resp, teacher_action[CONST.SELECTED_DOCS]),
    )
    self.metrics.add(
        'knowledge_f1_sentences',
        F1Metric.compute(resp, teacher_action[CONST.SELECTED_SENTENCES]),
    )

    # F1 metrics over the *retrieved* docs.
    self.metrics.add(
        'f1_retrieved_docs',
        F1Metric.compute(resp, [' '.join(teacher_action[CONST.RETRIEVED_DOCS])]),
    )
    self.metrics.add(
        'max_f1_retrieved_docs',
        F1Metric.compute(resp, teacher_action[CONST.RETRIEVED_DOCS]),
    )

    selected_doc_sentences = teacher_action[CONST.SELECTED_DOCS][0].split('\n')
    all_doc_sentences = []
    for doc in teacher_action[CONST.RETRIEVED_DOCS]:
        all_doc_sentences.extend(doc.split('\n'))

    # Copy metrics: how much of the response is lifted verbatim from the docs.
    self.metrics.add(
        'exact_copied_sentences', ExactMatchMetric.compute(resp, all_doc_sentences)
    )
    self.metrics.add(
        'max_substring_copied_sentences',
        CopiedSubstringMetric.compute(resp, all_doc_sentences),
    )
    self.metrics.add(
        'max_substring_copied_docs',
        CopiedSubstringMetric.compute(resp, teacher_action[CONST.RETRIEVED_DOCS]),
    )
    self.metrics.add(
        'substring_copied_docs',
        CopiedSubstringMetric.compute(
            resp, [''.join(teacher_action[CONST.RETRIEVED_DOCS])]
        ),
    )
    self.metrics.add(
        'max_f1_selected_docs_sentences',
        F1Metric.compute(resp, selected_doc_sentences),
    )
    self.metrics.add(
        'max_f1_docs_sentences', F1Metric.compute(resp, all_doc_sentences)
    )

    # N-gram matching metrics
    for k in range(1, 5):  # 1..4
        self.metrics.add(
            f'max_bleu_selected_docs_sentences-{k}',
            BleuMetric.compute(resp, selected_doc_sentences, k),
        )
    r1, r2, rL = RougeMetric.compute_many(resp, selected_doc_sentences)
    self.metrics.add('max_rouge_selected_docs_sentences_1', r1)
    self.metrics.add('max_rouge_selected_docs_sentences_2', r2)
    self.metrics.add('max_rouge_selected_docs_sentences_L', rL)

def custom_evaluation(
    self,
    teacher_action: Message,
    labels: Optional[Tuple[str]],
    model_response: Message,
):
    """
    Custom evaluations for Wizard of Wikipedia.

    When the label is `chosen_sent`, evaluate whether the model response...
    1) Is the correct document (title)
    2) _contains_ the correct chosen sentence (even if it's not wholly the answer)

    When the label is `response`, we compute F1 of the model generation w.r.t. the
    checked sentence.

    :param teacher_action:
        The message last sent from this teacher.
    :param labels:
        The previous correct labels, if there were any.
    :param model_response:
        The raw response from the model. Generally you want to rely on the text
        field, but others may be necessary in specific situations.
    """
    if (
        self.label_type == 'response'
        and 'text' in model_response
        and 'checked_sentence' in teacher_action
    ):
        self.metrics.add(
            'knowledge_f1',
            F1Metric.compute(
                model_response['text'], [teacher_action['checked_sentence']]
            ),
        )
        if labels:
            self.metrics.add(
                'rare_word_f1',
                self.rare_word_f1.compute(model_response['text'], labels),
            )
    elif (
        self.label_type == 'chosen_sent'
        and TOKEN_KNOWLEDGE in model_response['text']
    ):
        try:
            correct_title, correct_passage = [
                normalize_answer(a) for a in labels[0].split(TOKEN_KNOWLEDGE)
            ]
        except ValueError:
            # Knowledge was not chosen
            correct_title, correct_passage = TOKEN_NOCHOSEN, TOKEN_NOCHOSEN

        title, passage = [
            normalize_answer(a) for a in model_response['text'].split(TOKEN_KNOWLEDGE)
        ]
        self.metrics.add('title_r@1', AverageMetric(int(correct_title == title)))
        self.metrics.add('passage_r@1', AverageMetric(int(correct_passage in passage)))
        if 'title_candidates' in model_response:
            title_candidates = [
                normalize_answer(t) for t in model_response['title_candidates']
            ][:5]
            self.metrics.add(
                'title_r@5',
                AverageMetric(int(any(correct_title == t for t in title_candidates))),
            )
        if 'text_candidates' in model_response:
            text_candidates = [
                normalize_answer(t) for t in model_response['text_candidates']
            ][:5]
            self.metrics.add(
                'passage_r@5',
                AverageMetric(int(any(correct_passage in t for t in text_candidates))),
            )

def test_custom_eval(self):
    """
    Test whether custom evaluation works.
    """
    with testing_utils.capture_output():
        parser = setup_args()
        opt = parser.parse_args(
            [
                '--task',
                'wizard_of_wikipedia',
                '--datatype',
                'valid',
                '--label-type',
                'chosen_sent',
            ]
        )
        teacher = create_task_agent_from_taskname(opt)[0]

        title = 'Gardening'
        cands = list('four')  # four dummy single-character candidates
        text = "Gardening\nI like Gardening, even when I've only been doing it for a short time."
        response = 'I live on a farm, we garden all year long, it is very relaxing.'
        checked_sent = (
            'Gardening is considered by many people to be a relaxing activity.'
        )
        checked_sent_label = f'{title}{TOKEN_KNOWLEDGE}{checked_sent}'

        retrieval_metric_keys = [
            'passage_r@1',
            'passage_r@5',
            'title_r@1',
            'title_r@5',
        ]

        chosen_sent_teacher_action = Message(
            {
                'text': text,
                'labels': [checked_sent_label],
                'title': [title],
                'checked_sentence': [checked_sent],
            }
        )
        correct_chosen_sent_response = Message(
            {
                'text': checked_sent_label,
                'title_candidates': [title] + cands,
                'text_candidates': [checked_sent_label] + cands,
            }
        )
        top5_chosen_sent_response = Message(
            {
                'text': f'hello{TOKEN_KNOWLEDGE}goodbye',
                'title_candidates': cands + [title],
                'text_candidates': cands + [checked_sent_label],
            }
        )
        incorrect_chosen_sent_response = Message(
            {
                'text': f'hello{TOKEN_KNOWLEDGE}goodbye',
                'title_candidates': cands,
                'text_candidates': cands,
            }
        )

        response_teacher_action = Message(
            {'text': text, 'labels': [response], 'checked_sentence': checked_sent}
        )
        high_f1_response = Message({'text': checked_sent})
        low_f1_response = Message({'text': 'incorrect'})

        # 1) Test with correct top sentence
        teacher.reset_metrics()
        teacher.custom_evaluation(
            chosen_sent_teacher_action,
            [checked_sent_label],
            correct_chosen_sent_response,
        )
        report = teacher.report()
        for k in retrieval_metric_keys:
            assert k in report
            assert report[k] == AverageMetric(1)

        # 2) Test with top sentence in top 5
        teacher.reset_metrics()
        teacher.custom_evaluation(
            chosen_sent_teacher_action, [checked_sent_label], top5_chosen_sent_response
        )
        report = teacher.report()
        for k in retrieval_metric_keys:
            assert k in report
            # only the @5 metrics should score when the correct candidate is ranked fifth
            assert report[k] == (AverageMetric(1) if '5' in k else AverageMetric(0))

        # 3) Test with no top sentences
        teacher.reset_metrics()
        teacher.custom_evaluation(
            chosen_sent_teacher_action,
            [checked_sent_label],
            incorrect_chosen_sent_response,
        )
        report = teacher.report()
        for k in retrieval_metric_keys:
            assert k in report
            assert report[k] == AverageMetric(0)

        # 4) Test knowledge f1 with high f1
        teacher.label_type = 'response'
        teacher.reset_metrics()
        teacher.custom_evaluation(
            response_teacher_action, [response], high_f1_response
        )
        report = teacher.report()
        assert 'knowledge_f1' in report
        assert report['knowledge_f1'] == F1Metric(1)

        # 5) Test knowledge f1 with low f1
        teacher.reset_metrics()
        teacher.custom_evaluation(response_teacher_action, [response], low_f1_response)
        report = teacher.report()
        assert 'knowledge_f1' in report
        assert report['knowledge_f1'] == F1Metric(0)

def custom_evaluation(
    self,
    teacher_action: Message,
    labels: Optional[Tuple[str]],
    model_response: Message,
):
    if 'metrics' in model_response and 'type' in teacher_action:
        # keep copies of metrics across both api calls/responses
        prefix = teacher_action['type']
        keys = list(model_response['metrics'].keys())
        for k in keys:
            self.metrics.add(f'{prefix}_{k}', model_response['metrics'][k])

    if 'text' not in model_response or not labels or 'type' not in teacher_action:
        return

    domain = teacher_action['domain']

    if teacher_action['type'] == 'apicall':
        # also count slot accuracy
        text = model_response['text']
        slot_guesses = set(
            text.replace(CALL_TOKEN + " ", "").split(' ; ')
        )  # prevent cheating via repeated guesses
        correct = 0
        for slot_guess in slot_guesses:
            if ' = ' not in slot_guess:
                continue
            try:
                slot, guess = slot_guess.split(' = ')
            except ValueError:
                continue
            if teacher_action['slots'].get(slot) == guess:
                self.metrics.add('slot_p', AverageMetric(1))
                self.metrics.add(f'{domain}_slot_p', AverageMetric(1))
                correct += 1
            else:
                self.metrics.add('slot_p', AverageMetric(0))
                self.metrics.add(f'{domain}_slot_p', AverageMetric(0))
                logging.debug(
                    f"Bad slot guess '{slot_guess}' != {teacher_action['slots']}"
                )
        if teacher_action['slots']:
            self.metrics.add(
                'slot_r', AverageMetric(correct, len(teacher_action['slots']))
            )
            self.metrics.add(
                f'{domain}_slot_r',
                AverageMetric(correct, len(teacher_action['slots'])),
            )
            self.metrics.add(
                'jga', AverageMetric(correct == len(teacher_action['slots']))
            )
    elif teacher_action['type'] == 'apiresp':
        # keep track of statistics by domain
        f1_metric = F1Metric.compute(model_response['text'], labels)
        bleu_metric = BleuMetric.compute(model_response['text'], labels)
        self.metrics.add(f'{domain}_lex_f1', f1_metric)
        self.metrics.add(f'{domain}_lex_bleu', bleu_metric)

        delex_text = model_response['text']
        delex_label = labels[0]
        # compute delexicalized string metrics: replace slot values with slot names
        for slot, value in teacher_action['slots'].items():
            delex_text = delex_text.replace(value, slot)
            delex_label = delex_label.replace(value, slot)
        f1_metric = F1Metric.compute(delex_text, (delex_label,))
        self.metrics.add('delex_f1', f1_metric)
        self.metrics.add(f'{domain}_delex_f1', f1_metric)
        bleu_metric = BleuMetric.compute(delex_text, [delex_label])
        self.metrics.add('delex_bleu', bleu_metric)
        self.metrics.add(f'{domain}_delex_bleu', bleu_metric)
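
# A self-contained illustration of the delexicalization step above. The slot
# names and the utterance are hypothetical examples, not taken from the dataset.
slots = {'hotel_pricerange': 'cheap', 'hotel_area': 'north'}
utterance = 'i found a cheap hotel in the north for you'
for slot, value in slots.items():
    utterance = utterance.replace(value, slot)
# utterance is now: 'i found a hotel_pricerange hotel in the hotel_area for you'
# Scoring the delexicalized strings with F1Metric/BleuMetric rewards getting the
# response template right even when the surface slot values differ.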