Example #1
 def compute(guess: str, answers: List[str]) -> Optional["InMetric"]:
     if guess is None or answers is None:
         return None
     guess = normalize_answer(guess)
     for a in answers:
         if normalize_answer(a) in guess:
             return InMetric(1)
     return InMetric(0)
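All of these examples lean on the same normalize_answer helper, whose definition is not shown here. A minimal sketch of the SQuAD-style normalization such metrics typically assume (lowercasing, stripping punctuation and English articles, collapsing whitespace) might look like this:

    import re
    import string

    def normalize_answer(s: str) -> str:
        # Lowercase, drop punctuation, strip English articles, collapse whitespace.
        s = s.lower()
        s = ''.join(ch for ch in s if ch not in string.punctuation)
        s = re.sub(r'\b(a|an|the)\b', ' ', s)
        return ' '.join(s.split())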
Example #2
 def compute(guess: str, answers: List[str]):
     if guess is None or answers is None:
         return None
     guess = normalize_answer(guess)
     for a in answers:
         if guess in normalize_answer(a):
             return CopiedSubstringMetric(1)
     return CopiedSubstringMetric(0)
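The only difference from Example #1 is the direction of containment: InMetric asks whether a gold answer appears inside the guess, while CopiedSubstringMetric asks whether the normalized guess appears inside a gold answer. A small illustration under the normalize_answer sketch above (values are hypothetical):

    # CopiedSubstring direction: the guess is a substring of the reference answer.
    guess, answers = 'The Eiffel Tower', ['the Eiffel Tower in Paris, France']
    print(normalize_answer(guess) in normalize_answer(answers[0]))  # True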
Example #3
 def _filter(freq_dist, cutoff: int, text: str) -> str:
     """
     For words that are found in the reference distribution, filters those with an
     occurrence count less than the cutoff.
     """
     words = normalize_answer(text).split()
     return " ".join([w for w in words if freq_dist.get(w, cutoff) < cutoff])
Example #4
 def __init__(self, corpus: str, top_p: float = 0.5):
     try:
         import nltk
     except ImportError:
         raise ImportError('Please install nltk (e.g. pip install nltk).')
     words = normalize_answer(corpus).split()
     self._freq_dist = nltk.FreqDist(words)
     self._cutoff_count = RareWordF1Calculator._find_cutoff_count(
         self._freq_dist, top_p)
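RareWordF1Calculator._find_cutoff_count is referenced but not shown. A plausible sketch (an assumption, not the library's actual code) walks the distribution from most to least frequent and returns the count at which the cumulative mass first covers top_p of all tokens:

    @staticmethod
    def _find_cutoff_count(freq_dist, top_p: float) -> int:
        # Any word occurring at least this often sits inside the top_p mass of
        # the corpus and is treated as "common" by _filter above.
        target = sum(freq_dist.values()) * top_p
        cumulative = 0
        for _, count in freq_dist.most_common():
            cumulative += count
            if cumulative >= target:
                return count
        raise RuntimeError(f'Invalid top_p value: {top_p}')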
Example #5
 def setup_data(self, fold):
     with PathManager.open(
             os.path.join(self.dpath,
                          self.fold + "_with_gold.json")) as json_file:
         gold_datas = json.load(json_file)["data"]
     for gold_data in gold_datas:
         text = gold_data["question"]
         label = gold_data["short_answers"][0]
         if self.opt.get("normalize_everything"):
             text = normalize_answer(text)
             label = normalize_answer(label)
         yield {
             'text': text,
             'label': label,
             'title': gold_data['title'],
             'checked_sentence': gold_data['context'],
             'answers': json.dumps(gold_data["short_answers"]),
         }, True
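For orientation, the loop above implies that each `*_with_gold.json` file holds a top-level "data" list whose entries expose the fields accessed in the code. An illustrative entry (field names come from the accesses above; the values are purely made up):

    gold_data = {
        "question": "who wrote the iliad",
        "short_answers": ["Homer"],
        "title": "Iliad",
        "context": "The Iliad is an ancient Greek epic poem attributed to Homer.",
    }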
Example #6
 def process_prediction(prediction, word_statistics):
     word_statistics['pred_list'].append(normalize_answer(prediction))
     freqs, _cnt, wlength, clength = get_word_stats(prediction,
                                                    dictionary,
                                                    bins=bins)
     word_statistics['word_cnt'] += _cnt
     word_statistics['mean_wlength'].append(wlength)
     word_statistics['mean_clength'].append(clength)
     word_statistics['freqs_cnt'] += Counter(freqs)
     return word_statistics
Example #7
 def process_prediction(prediction, word_statistics):
     normalized = normalize_answer(prediction)
     word_statistics['pred_list'].append(normalized)
     freqs, _cnt, wlength, clength = get_word_stats(prediction,
                                                    dictionary,
                                                    bins=bins)
     word_statistics['word_cnt'] += _cnt
     word_statistics['mean_wlength'].append(wlength)
     word_statistics['mean_clength'].append(clength)
     word_statistics['freqs_cnt'] += Counter(freqs)
     word_statistics['unique_words'] |= set(normalized.split(" "))
     return word_statistics
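Both process_prediction variants mutate a word_statistics dict in place. From the keys they touch, the accumulator they expect looks like this (the Counter import and the initial values are inferred, not shown in the source):

    from collections import Counter

    word_statistics = {
        'pred_list': [],         # normalized predictions, in order
        'word_cnt': 0,           # running token count
        'mean_wlength': [],      # per-prediction mean word lengths
        'mean_clength': [],      # per-prediction mean character lengths
        'freqs_cnt': Counter(),  # frequency-bin counts from get_word_stats
        'unique_words': set(),   # only used by the second variant
    }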
Example #8
 def handle_message_helper(
         self, prefix_stripped_text: str) -> Optional[Dict[str, Metric]]:
     here = [normalize_answer(x) for x in prefix_stripped_text.split(" ")]
     score = 1
     if len(self.turns) > 0:
         score = nltkbleu.corpus_bleu(
             [self.turns],
             [here],
             smoothing_function=nltkbleu.SmoothingFunction(
                 epsilon=1e-12).method1,
             weights=[1.0 / 3.0] * 3,
         )
     self.turns.append(here)
     return {self.metric_key(): BleuMetric(score)}
Example #9
    def _record_retrieval_metrics(self, batch: Batch,
                                  encoder_state: Tuple[Any, ...]):
        """
        Compute retrieval metrics, given retrieved documents.

        Only works when `--debug` is set.

        If there is knowledge in the Batch, we compute the following metrics:
        A) Doc Level:
        1. recall @ 1 --> is the correct document the first document?
        2. recall @ N --> is the correct document in the first N docs?

        B) Passage Level:
        1. recall @ 1 --> is the correct passage in the first document?
        2. recall @ N --> is the correct passage in the first N docs?

        :param batch:
            training/eval batch
        :param encoder_state:
            encoder states from RagEncoder
        """
        if batch.valid_indices is None or batch.observations is None:
            return
        docs: List[List[Document]] = []
        _, _, input_turns_cnt, docs, _ = encoder_state
        if input_turns_cnt is not None:
            new_docs = []
            offset = 0
            for it in input_turns_cnt:
                docs_it = [dd for d in docs[offset:offset + it] for dd in d]
                new_docs.append(docs_it)
                offset += it
            docs = new_docs
        title_key = self.opt['gold_knowledge_title_key']
        passage_key = self.opt['gold_knowledge_passage_key']
        batchsize = len(batch.valid_indices)
        n_docs = self.opt['n_docs']
        metrics = {
            k: [0] * batchsize
            for k in [
                'doc_r@1',
                f'doc_r@{n_docs}',
                'passage_r@1',
                f'passage_r@{n_docs}',
                'title@1_f1',
                'passage@1_f1',
            ]
        }
        for i in range(batchsize):
            ex = batch.observations[i]
            label_title = normalize_answer(ex.get(title_key, ''))
            label_passage = normalize_answer(ex.get(passage_key, ''))

            for rank, doc in enumerate(docs[i]):
                model_title = normalize_answer(doc.get_title())
                model_passage = normalize_answer(doc.get_text())

                title_exact_match = model_title == label_title
                passage_match = (model_passage in label_passage
                                 or label_passage in model_passage)

                if rank == 0:
                    metrics['doc_r@1'][i] = int(title_exact_match)
                    metrics['passage_r@1'][i] = int(passage_match)
                    metrics['title@1_f1'][i] = F1Metric.compute(
                        guess=model_title, answers=[label_title]).value()
                    metrics['passage@1_f1'][i] = F1Metric.compute(
                        guess=model_passage, answers=[label_passage]).value()
                metrics[f'doc_r@{n_docs}'][i] = int(
                    metrics[f'doc_r@{n_docs}'][i] or title_exact_match)
                metrics[f'passage_r@{n_docs}'][i] = int(
                    metrics[f'passage_r@{n_docs}'][i] or passage_match)

        for m in metrics:
            self.record_local_metric(
                m, AverageMetric.many(metrics[m], [1] * batchsize))
Example #10
    def custom_evaluation(
        self,
        teacher_action: Message,
        labels: Optional[Tuple[str]],
        model_response: Message,
    ):
        """
        Custom Evaluations for Wizard of Wikipedia.

        When the label is `chosen_sent`, evaluate whether the model response...
        1) Is the correct document (title)
        2) _contains_ the correct chosen sentence (even if it's not wholly the answer)

        When the label is `response`, we compute the F1 of the model generation
        with respect to the checked sentence.

        :param teacher_action:
            The message last sent from this teacher.
        :param labels:
            The previous correct labels, if there were any.
        :param model_response:
            The raw response from the model. Generally you want to rely on the
            text field, but others may be necessary in specific situations.
        """
        if (self.label_type == 'response' and 'text' in model_response
                and 'checked_sentence' in teacher_action):
            self.metrics.add(
                'knowledge_f1',
                F1Metric.compute(model_response['text'],
                                 [teacher_action['checked_sentence']]),
            )
            if labels:
                self.metrics.add(
                    'rare_word_f1',
                    self.rare_word_f1.compute(model_response['text'], labels),
                )
        elif (self.label_type == 'chosen_sent'
              and TOKEN_KNOWLEDGE in model_response.get('text', '')):
            try:
                correct_title, correct_passage = [
                    normalize_answer(a)
                    for a in labels[0].split(TOKEN_KNOWLEDGE)
                ]
            except ValueError:
                # Knowledge not chosen
                correct_title, correct_passage = TOKEN_NOCHOSEN, TOKEN_NOCHOSEN
            title, passage = [
                normalize_answer(a)
                for a in model_response['text'].split(TOKEN_KNOWLEDGE)
            ]

            self.metrics.add('title_r@1',
                             AverageMetric(int(correct_title == title)))
            self.metrics.add('passage_r@1',
                             AverageMetric(int(correct_passage in passage)))
            if 'title_candidates' in model_response:
                title_candidates = [
                    normalize_answer(t)
                    for t in model_response['title_candidates']
                ][:5]
                self.metrics.add(
                    'title_r@5',
                    AverageMetric(
                        int(any(correct_title == t
                                for t in title_candidates))),
                )
            if 'text_candidates' in model_response:
                text_candidates = [
                    normalize_answer(t)
                    for t in model_response['text_candidates']
                ][:5]
                self.metrics.add(
                    'passage_r@5',
                    AverageMetric(
                        int(any(correct_passage in t
                                for t in text_candidates))),
                )
Example #11
def eval_wordstat(opt, print_parser=None):
    """Evaluates a model.

    Arguments:
    opt -- tells the evaluation function how to run
    print_parser -- if provided, prints the options that are set within the
        model after loading the model
    """
    random.seed(42)

    # Create model and assign it to the specified task
    agent = create_agent(opt, requireModelExists=True)
    world = create_task(opt, agent)

    if opt.get('external_dict'):
        print('[ Using external dictionary from: {} ]'.format(
            opt['external_dict']))
        dict_opt = copy.deepcopy(opt)
        dict_opt['dict_file'] = opt['external_dict']
        dictionary = DictionaryAgent(dict_opt)
    else:
        print('[ Using model bundled dictionary ]')
        dictionary = agent.dict

    if print_parser:
        # Show arguments after loading model
        print_parser.opt = agent.opt
        print_parser.print_args()
    log_every_n_secs = opt.get('log_every_n_secs', -1)
    if log_every_n_secs <= 0:
        log_every_n_secs = float('inf')
    log_time = TimeLogger()

    cnt = 0
    mean_wlength = []
    mean_clength = []
    freqs_cnt = Counter()
    word_cnt = 0
    bins = [int(i) for i in opt['freq_bins'].split(',')]
    pred_list = []

    while not world.epoch_done():
        cnt += 1
        world.parley()
        prediction = world.acts[-1]['text']
        pred_list.append(normalize_answer(prediction))
        freqs, _cnt, wlength, clength = get_word_stats(prediction,
                                                       dictionary,
                                                       bins=bins)
        word_cnt += _cnt

        mean_wlength.append(wlength)
        mean_clength.append(clength)

        freqs_cnt += Counter(freqs)

        if log_time.time() > log_every_n_secs or (
                opt['num_examples'] > 0
                and cnt >= opt['num_examples']) or world.epoch_done():
            report = world.report()
            text, report = log_time.log(report['exs'], world.num_examples(),
                                        report)
            print(text)
            stat_str = 'total_words: {}, '.format(word_cnt) + ', '.join([
                '<{}:{} ({:.{prec}f}%)'.format(
                    b,
                    freqs_cnt.get(b, 0),
                    (freqs_cnt.get(b, 0) / word_cnt) * 100,
                    prec=2) for b in bins
            ])
            print(
                "Word statistics: {}, avg_word_length: {:.{prec}f}, avg_char_length: {:.{prec}f}"
                .format(stat_str,
                        numpy.array(mean_wlength).mean(),
                        numpy.array(mean_clength).mean(),
                        prec=2))
        if opt['num_examples'] > 0 and cnt >= opt['num_examples']:
            break
    if world.epoch_done():
        print("EPOCH DONE")

    if opt['compute_unique']:
        unique_list = []
        cntr = Counter(pred_list)
        for k, v in cntr.items():
            if v == 1:
                unique_list.append(k)
        print("Unique responses: {:.{prec}f}%".format(len(unique_list) /
                                                      len(pred_list) * 100,
                                                      prec=2))

    if opt['dump_predictions_path'] is not None:
        with open(opt['dump_predictions_path'], 'w') as f:
            f.writelines(['{}\n'.format(i) for i in pred_list])
        if opt['compute_unique']:
            with open(opt['dump_predictions_path'] + '_unique', 'w') as f:
                f.writelines(['{}\n'.format(i) for i in unique_list])

    report = world.report()
    print(report)
    return report
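From the opt accesses in eval_wordstat, a minimal options dict for this script needs at least the following keys (the values shown are illustrative placeholders, not the script's real defaults):

    opt_overrides = {
        'freq_bins': '100,1000,10000',   # comma-separated dictionary-frequency bins
        'num_examples': -1,              # <= 0 means run the full epoch
        'compute_unique': True,
        'dump_predictions_path': None,
        'log_every_n_secs': 2,           # read via opt.get, so optional
        'external_dict': None,           # optional path to a separate dictionary
    }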
Example #12
 def handle_message_helper(
         self, prefix_stripped_text: str) -> Optional[Dict[str, Metric]]:
     normalized = normalize_answer(prefix_stripped_text)
     if normalized in self.turns:
         self.repeated = True
     self.turns.append(normalized)
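The final snippet only records state: it flags self.repeated when a normalized turn exactly matches an earlier one. A hypothetical way to surface that flag as a metric, reusing the metric_key() and AverageMetric conventions from the other examples (this wrap-up is an assumption, not part of the source), could be:

    def get_metric(self) -> Dict[str, Metric]:
        # Hypothetical: report 1 if any turn in this episode exactly repeated
        # an earlier normalized turn, else 0.
        return {self.metric_key(): AverageMetric(int(self.repeated))}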