def compute(guess: str, answers: List[str]) -> Optional["InMetric"]:
    if guess is None or answers is None:
        return None
    guess = normalize_answer(guess)
    for a in answers:
        if normalize_answer(a) in guess:
            return InMetric(1)
    return InMetric(0)
def compute(guess: str, answers: List[str]) -> Optional["CopiedSubstringMetric"]:
    if guess is None or answers is None:
        return None
    guess = normalize_answer(guess)
    for a in answers:
        if guess in normalize_answer(a):
            return CopiedSubstringMetric(1)
    return CopiedSubstringMetric(0)
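# All of the excerpts in this section rely on normalize_answer. A minimal sketch of
# the usual SQuAD-style normalization is included here for reference only; the
# project's real helper may differ in details, so treat this as an assumption.
import re
import string


def normalize_answer_sketch(s: str) -> str:
    """Lowercase, strip punctuation and articles, and collapse whitespace (sketch)."""
    s = s.lower()
    s = ''.join(ch for ch in s if ch not in string.punctuation)  # drop punctuation
    s = re.sub(r'\b(a|an|the)\b', ' ', s)                        # drop articles
    return ' '.join(s.split())                                   # collapse whitespace


# Hypothetical usage of the two metrics above, assuming compute is a staticmethod on
# each metric class as the type hints suggest:
#   InMetric.compute("The Eiffel Tower is in Paris.", ["Paris"])  -> InMetric(1)
#   CopiedSubstringMetric.compute("Paris", ["Paris, France"])     -> CopiedSubstringMetric(1)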
def _filter(freq_dist, cutoff: int, text: str) -> str:
    """
    Keep only the words whose occurrence count in the reference distribution is
    below the cutoff (i.e. rare words); words not found in the distribution are
    dropped as well.
    """
    words = normalize_answer(text).split()
    return " ".join([w for w in words if freq_dist.get(w, cutoff) < cutoff])
def __init__(self, corpus: str, top_p: float = 0.5):
    try:
        import nltk
    except ImportError:
        raise ImportError('Please install nltk (e.g. pip install nltk).')
    words = normalize_answer(corpus).split()
    self._freq_dist = nltk.FreqDist(words)
    self._cutoff_count = RareWordF1Calculator._find_cutoff_count(
        self._freq_dist, top_p
    )
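# _find_cutoff_count is referenced above but not shown. A plausible sketch, assuming
# the cutoff is the occurrence count at which the most frequent words cover `top_p`
# of all tokens; _filter then keeps only words rarer than that. This is an
# illustration of the idea, not the actual implementation.
def _find_cutoff_count_sketch(freq_dist, top_p: float) -> int:
    target = sum(freq_dist.values()) * top_p
    cumulative = 0
    for _, count in freq_dist.most_common():
        cumulative += count
        if cumulative >= target:
            return count
    raise RuntimeError(f'Invalid top_p value: {top_p}')


# Toy check with a plain Counter (nltk.FreqDist subclasses Counter):
# Counter({'the': 50, 'cat': 5, 'zyzzyva': 1}) with top_p=0.5 gives a cutoff of 50,
# so _filter keeps 'cat' and 'zyzzyva' but drops 'the' (and any unseen word).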
def setup_data(self, fold):
    with PathManager.open(
        os.path.join(self.dpath, self.fold + "_with_gold.json")
    ) as json_file:
        gold_datas = json.load(json_file)["data"]
        for gold_data in gold_datas:
            text = gold_data["question"]
            label = gold_data["short_answers"][0]
            if self.opt.get("normalize_everything"):
                text = normalize_answer(text)
                label = normalize_answer(label)
            yield {
                'text': text,
                'label': label,
                'title': gold_data['title'],
                'checked_sentence': gold_data['context'],
                'answers': json.dumps(gold_data["short_answers"]),
            }, True
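# The teacher above reads a JSON file of the form {"data": [...]}. A hypothetical
# record, inferred only from the keys accessed in setup_data (the field values here
# are invented for illustration):
example_gold_data = {
    "question": "who designed the eiffel tower",
    "short_answers": ["Gustave Eiffel"],
    "title": "Eiffel Tower",
    "context": "The tower was designed by the engineer Gustave Eiffel.",
}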
def process_prediction(prediction, word_statistics):
    word_statistics['pred_list'].append(normalize_answer(prediction))
    freqs, _cnt, wlength, clength = get_word_stats(prediction, dictionary, bins=bins)
    word_statistics['word_cnt'] += _cnt
    word_statistics['mean_wlength'].append(wlength)
    word_statistics['mean_clength'].append(clength)
    word_statistics['freqs_cnt'] += Counter(freqs)
    return word_statistics
def process_prediction(prediction, word_statistics):
    normalized = normalize_answer(prediction)
    word_statistics['pred_list'].append(normalized)
    freqs, _cnt, wlength, clength = get_word_stats(prediction, dictionary, bins=bins)
    word_statistics['word_cnt'] += _cnt
    word_statistics['mean_wlength'].append(wlength)
    word_statistics['mean_clength'].append(clength)
    word_statistics['freqs_cnt'] += Counter(freqs)
    word_statistics['unique_words'] |= set(normalized.split(" "))
    return word_statistics
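# A minimal sketch of the accumulator the two process_prediction variants expect,
# inferred from the keys they touch (this dict layout is an assumption based on the
# code above, not a documented API):
from collections import Counter

word_statistics = {
    'pred_list': [],         # normalized predictions, in order
    'word_cnt': 0,           # running token count
    'mean_wlength': [],      # per-prediction average word length
    'mean_clength': [],      # per-prediction average character length
    'freqs_cnt': Counter(),  # histogram over dictionary-frequency bins
    'unique_words': set(),   # vocabulary across predictions (second variant only)
}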
def handle_message_helper(
    self, prefix_stripped_text: str
) -> Optional[Dict[str, Metric]]:
    here = [normalize_answer(x) for x in prefix_stripped_text.split(" ")]
    score = 1
    if len(self.turns) > 0:
        score = nltkbleu.corpus_bleu(
            [self.turns],
            [here],
            smoothing_function=nltkbleu.SmoothingFunction(epsilon=1e-12).method1,
            weights=[1.0 / 3.0] * 3,
        )
    self.turns.append(here)
    return {self.metric_key(): BleuMetric(score)}
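# Hedged illustration of the intra-conversation BLEU above: each new turn is scored
# against all previous turns, so a high value indicates self-repetition. `nltkbleu`
# is assumed to be nltk.translate.bleu_score.
import nltk.translate.bleu_score as nltkbleu

previous_turns = [['hello', 'how', 'are', 'you'], ['i', 'am', 'fine', 'thanks']]
current_turn = ['hello', 'how', 'are', 'you', 'today']
score = nltkbleu.corpus_bleu(
    [previous_turns],  # references for the single hypothesis: every previous turn
    [current_turn],    # the single hypothesis: the current turn
    smoothing_function=nltkbleu.SmoothingFunction(epsilon=1e-12).method1,
    weights=[1.0 / 3.0] * 3,
)
print(score)  # roughly 0.74 here, since the turn largely repeats an earlier one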
def _record_retrieval_metrics(self, batch: Batch, encoder_state: Tuple[Any, ...]):
    """
    Compute retrieval metrics, given retrieved documents.

    Only works when `--debug` is set.

    If there is knowledge in the Batch, we compute the following metrics:
    A) Doc Level:
        1. recall @ 1 --> is the correct document the first document?
        2. recall @ N --> is the correct document in the first N docs?
    B) Passage Level:
        1. recall @ 1 --> is the correct passage in the first document?
        2. recall @ N --> is the correct passage in the first N docs?

    :param batch:
        training/eval batch
    :param encoder_state:
        encoder states from RagEncoder
    """
    if batch.valid_indices is None or batch.observations is None:
        return
    docs: List[List[Document]] = []
    _, _, input_turns_cnt, docs, _ = encoder_state
    if input_turns_cnt is not None:
        new_docs = []
        offset = 0
        for it in input_turns_cnt:
            docs_it = [dd for d in docs[offset:offset + it] for dd in d]
            new_docs.append(docs_it)
            offset += it
        docs = new_docs

    title_key = self.opt['gold_knowledge_title_key']
    passage_key = self.opt['gold_knowledge_passage_key']
    batchsize = len(batch.valid_indices)
    n_docs = self.opt['n_docs']
    metrics = {
        k: [0] * batchsize
        for k in [
            'doc_r@1',
            f'doc_r@{n_docs}',
            'passage_r@1',
            f'passage_r@{n_docs}',
            'title@1_f1',
            'passage@1_f1',
        ]
    }
    for i in range(batchsize):
        ex = batch.observations[i]
        label_title = normalize_answer(ex.get(title_key, ''))
        label_passage = normalize_answer(ex.get(passage_key, ''))
        for rank, doc in enumerate(docs[i]):
            model_title = normalize_answer(doc.get_title())
            model_passage = normalize_answer(doc.get_text())

            title_exact_match = model_title == label_title
            passage_match = (
                model_passage in label_passage or label_passage in model_passage
            )

            if rank == 0:
                metrics['doc_r@1'][i] = int(title_exact_match)
                metrics['passage_r@1'][i] = int(passage_match)
                metrics['title@1_f1'][i] = F1Metric.compute(
                    guess=model_title, answers=[label_title]
                ).value()
                metrics['passage@1_f1'][i] = F1Metric.compute(
                    guess=model_passage, answers=[label_passage]
                ).value()
            metrics[f'doc_r@{n_docs}'][i] = int(
                metrics[f'doc_r@{n_docs}'][i] or title_exact_match
            )
            metrics[f'passage_r@{n_docs}'][i] = int(
                metrics[f'passage_r@{n_docs}'][i] or passage_match
            )

    for m in metrics:
        self.record_local_metric(m, AverageMetric.many(metrics[m], [1] * batchsize))
def custom_evaluation(
    self,
    teacher_action: Message,
    labels: Optional[Tuple[str]],
    model_response: Message,
):
    """
    Custom Evaluations for Wizard of Wikipedia.

    When the label is `chosen_sent`, evaluate whether the model response...
    1) Is the correct document (title)
    2) _contains_ the correct chosen sentence (even if it's not wholly the answer)

    When the label is `response`, we compute F1 of model generation w.r.t checked
    sentence.

    :param teacher_action:
        The message last sent from this teacher.
    :param labels:
        The previous correct labels, if there were any.
    :param model_response:
        The raw response from the model. Generally you want to rely on the text
        field, but others may be necessary in specific situations.
    """
    if (
        self.label_type == 'response'
        and 'text' in model_response
        and 'checked_sentence' in teacher_action
    ):
        self.metrics.add(
            'knowledge_f1',
            F1Metric.compute(
                model_response['text'], [teacher_action['checked_sentence']]
            ),
        )
        if labels:
            self.metrics.add(
                'rare_word_f1',
                self.rare_word_f1.compute(model_response['text'], labels),
            )
    elif (
        self.label_type == 'chosen_sent'
        and TOKEN_KNOWLEDGE in model_response['text']
    ):
        try:
            correct_title, correct_passage = [
                normalize_answer(a) for a in labels[0].split(TOKEN_KNOWLEDGE)
            ]
        except ValueError:
            # Knowledge not chosen
            correct_title, correct_passage = TOKEN_NOCHOSEN, TOKEN_NOCHOSEN

        title, passage = [
            normalize_answer(a)
            for a in model_response['text'].split(TOKEN_KNOWLEDGE)
        ]

        self.metrics.add('title_r@1', AverageMetric(int(correct_title == title)))
        self.metrics.add(
            'passage_r@1', AverageMetric(int(correct_passage in passage))
        )
        if 'title_candidates' in model_response:
            title_candidates = [
                normalize_answer(t) for t in model_response['title_candidates']
            ][:5]
            self.metrics.add(
                'title_r@5',
                AverageMetric(
                    int(any(correct_title == t for t in title_candidates))
                ),
            )
        if 'text_candidates' in model_response:
            text_candidates = [
                normalize_answer(t) for t in model_response['text_candidates']
            ][:5]
            self.metrics.add(
                'passage_r@5',
                AverageMetric(
                    int(any(correct_passage in t for t in text_candidates))
                ),
            )
def eval_wordstat(opt, print_parser=None):
    """
    Evaluates a model.

    Arguments:
    opt -- tells the evaluation function how to run
    print_parser -- if provided, prints the options that are set within the model
        after loading the model
    """
    random.seed(42)

    # Create model and assign it to the specified task
    agent = create_agent(opt, requireModelExists=True)
    world = create_task(opt, agent)

    if opt.get('external_dict'):
        print('[ Using external dictionary from: {} ]'.format(opt['external_dict']))
        dict_opt = copy.deepcopy(opt)
        dict_opt['dict_file'] = opt['external_dict']
        dictionary = DictionaryAgent(dict_opt)
    else:
        print('[ Using model bundled dictionary ]')
        dictionary = agent.dict

    if print_parser:
        # Show arguments after loading model
        print_parser.opt = agent.opt
        print_parser.print_args()

    log_every_n_secs = opt.get('log_every_n_secs', -1)
    if log_every_n_secs <= 0:
        log_every_n_secs = float('inf')
    log_time = TimeLogger()

    cnt = 0
    mean_wlength = []
    mean_clength = []
    freqs_cnt = Counter()
    word_cnt = 0
    bins = [int(i) for i in opt['freq_bins'].split(',')]
    pred_list = []

    while not world.epoch_done():
        cnt += 1
        world.parley()
        prediction = world.acts[-1]['text']
        pred_list.append(normalize_answer(prediction))

        freqs, _cnt, wlength, clength = get_word_stats(
            prediction, dictionary, bins=bins
        )
        word_cnt += _cnt
        mean_wlength.append(wlength)
        mean_clength.append(clength)
        freqs_cnt += Counter(freqs)

        if (
            log_time.time() > log_every_n_secs
            or (opt['num_examples'] > 0 and cnt >= opt['num_examples'])
            or world.epoch_done()
        ):
            report = world.report()
            text, report = log_time.log(report['exs'], world.num_examples(), report)
            print(text)

            stat_str = 'total_words: {}, '.format(word_cnt) + ', '.join(
                [
                    '<{}:{} ({:.{prec}f}%)'.format(
                        b,
                        freqs_cnt.get(b, 0),
                        (freqs_cnt.get(b, 0) / word_cnt) * 100,
                        prec=2,
                    )
                    for b in bins
                ]
            )
            print(
                "Word statistics: {}, avg_word_length: {:.{prec}f}, "
                "avg_char_length: {:.{prec}f}".format(
                    stat_str,
                    numpy.array(mean_wlength).mean(),
                    numpy.array(mean_clength).mean(),
                    prec=2,
                )
            )
        if opt['num_examples'] > 0 and cnt >= opt['num_examples']:
            break
    if world.epoch_done():
        print("EPOCH DONE")

    if opt['compute_unique'] is True:
        unique_list = []
        cntr = Counter(pred_list)
        for k, v in cntr.items():
            if v == 1:
                unique_list.append(k)
        print(
            "Unique responses: {:.{prec}f}%".format(
                len(unique_list) / len(pred_list) * 100, prec=2
            )
        )

    if opt['dump_predictions_path'] is not None:
        with open(opt['dump_predictions_path'], 'w') as f:
            f.writelines(['{}\n'.format(i) for i in pred_list])
        if opt['compute_unique'] is True:
            with open(opt['dump_predictions_path'] + '_unique', 'w') as f:
                f.writelines(['{}\n'.format(i) for i in unique_list])

    report = world.report()
    print(report)
    return report
def handle_message_helper(
    self, prefix_stripped_text: str
) -> Optional[Dict[str, Metric]]:
    normalized = normalize_answer(prefix_stripped_text)
    if normalized in self.turns:
        self.repeated = True
    self.turns.append(normalized)
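# Minimal sketch of how the repetition tracker above behaves across turns; the
# surrounding class is not shown, so the wiring here is an assumption, and
# normalize_answer stands in for the same helper used throughout these excerpts.
turns: list = []
repeated = False
for utterance in ["Hello there!", "How are you?", "hello, there"]:
    normalized = normalize_answer(utterance)
    if normalized in turns:
        repeated = True  # "hello, there" normalizes to "hello there" -> repeat
    turns.append(normalized)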