Example #1
 def report(self):
     """Report the metrics over all data seen so far."""
     m = {}
     total = self.metrics['cnt']
     m['exs'] = total
     if total > 0:
         if self.flags['print_prediction_metrics']:
             if 'accuracy' in self.metrics_list:
                 m['accuracy'] = round_sigfigs(
                     self.metrics['correct'] /
                     max(1, self.metrics['correct_cnt']), 4)
             if 'f1' in self.metrics_list:
                 m['f1'] = round_sigfigs(
                     self.metrics['f1'] / max(1, self.metrics['f1_cnt']), 4)
         if self.flags['has_text_cands']:
             for k in self.eval_pr:
                 m['hits@' + str(k)] = round_sigfigs(
                     self.metrics['hits@' + str(k)] /
                     max(1, self.metrics['hits@_cnt']),
                     3,
                 )
         for k in self.metrics_list:
             if self.metrics[k +
                             '_cnt'] > 0 and k != 'correct' and k != 'f1':
                 m[k] = round_sigfigs(
                     self.metrics[k] / max(1, self.metrics[k + '_cnt']), 4)
     return m
Example #2
 def report(self):
     """Report per-dialogue round metrics."""
     m = {k: {} for k in ["first_round", "second_round", "third_round+"]}
     for k, v in self.metrics.items():
         if v["num_samples"] > 0:
             m[k]["hits@1/100"] = round_sigfigs(
                 v["hits@1/100"] / v["num_samples"], 4)
             m[k]["loss"] = round_sigfigs(v["loss"] / v["num_samples"], 4)
             if "med_rank" in v:
                 m[k]["med_rank"] = np.median(v["med_rank"])
     return m
Example #3
 def report(self):
     m = {}
     with self._lock():
         m['exs'] = self.metrics['exs']
         if m['exs'] > 0:
             # m['num_unk'] = self.metrics['num_unk']
             # m['num_tokens'] = self.metrics['num_tokens']
             m['loss'] = round_sigfigs(
                 self.metrics['loss'] / self.metrics['num_tokens'], 3)
             m['ppl'] = round_sigfigs(
                 math.exp(self.metrics['loss'] /
                          self.metrics['num_tokens']), 4)
     return m
Example #4
    def test_round_sigfigs(self):
        x = 0
        y = 0
        assert round_sigfigs(x, 2) == y

        x = 100
        y = 100
        assert round_sigfigs(x, 2) == y

        x = 0.01
        y = 0.01
        assert round_sigfigs(x, 2) == y

        x = 0.00123
        y = 0.001
        assert round_sigfigs(x, 1) == y

        x = 0.37
        y = 0.4
        assert round_sigfigs(x, 1) == y

        x = 2353
        y = 2350
        assert round_sigfigs(x, 3) == y

        x = 3547345734
        y = 3547350000
        assert round_sigfigs(x, 6) == y

        x = 0.0000046246
        y = 0.00000462
        assert round_sigfigs(x, 3) == y
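For reference, here is a minimal sketch of a sigfig-rounding helper that is consistent with the behaviour exercised by these tests; the name round_sigfigs_sketch and the implementation are assumptions for illustration, not the library's own code:

import math

def round_sigfigs_sketch(x, sigfigs=4):
    """Round x to `sigfigs` significant figures (illustrative stand-in)."""
    if x == 0:
        return 0
    # shift the rounding position according to the magnitude of x
    return round(x, -int(math.floor(math.log10(abs(x)))) + (sigfigs - 1))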
Example #5
    def report(self):
        """
        Report loss and perplexity from model's perspective.

        Note that this includes predicting __END__ and __UNK__ tokens and may differ
        from a truly independent measurement.

        Additionally report tokenized bleu scores, if desired.
        """
        base = super().report()
        m = {}
        num_tok = self.metrics['num_tokens']
        if num_tok > 0:
            m['loss'] = self.metrics['loss']
            if self.metrics['correct_tokens'] > 0:
                m['token_acc'] = self.metrics['correct_tokens'] / num_tok
            m['nll_loss'] = self.metrics['nll_loss'] / num_tok
            try:
                m['ppl'] = math.exp(m['nll_loss'])
            except OverflowError:
                m['ppl'] = float('inf')
        if self.metrics['total_skipped_batches'] > 0:
            m['total_skipped_batches'] = self.metrics['total_skipped_batches']
        for k, v in m.items():
            # clean up: rounds to sigfigs and converts tensors to floats
            base[k] = round_sigfigs(v, 4)
        if not self.skip_generation and self.compute_tokenized_bleu:
            base.update({'fairseq_bleu': 'N/A', 'nltk_bleu_unnormalized': 'N/A'})
            if fairseq_bleu is not None:
                try:
                    fairseq_bleu_scores = {
                        k: self.fairseq_bleu_scorer.result_string(order=k)
                        for k in range(1, 5)
                    }
                except ZeroDivisionError:
                    # some preds are REAL bad
                    fairseq_bleu_scores = {k: '= 0,' for k in range(1, 5)}

                base['fairseq_bleu'] = {
                    k: float(v[v.index('= ') + 2 : v.index(',')])
                    for k, v in fairseq_bleu_scores.items()
                }
            if nltkbleu is not None:
                base['nltk_bleu_unnormalized'] = {
                    k: round_sigfigs(v['score'] / v['cnt'], 4)
                    for k, v in self.nltk_bleu_scores.items()
                }
        return base
Example #6
    def report(self):
        """Report loss as well as precision, recall, and F1 metrics."""
        m = super().report()
        examples = self.metrics['examples']
        if examples > 0:
            m['examples'] = examples
            m['mean_loss'] = self.metrics['loss'] / examples

            # get prec/recall metrics
            confmat = self.metrics['confusion_matrix']
            if self.opt.get('get_all_metrics'):
                metrics_list = self.class_list
            else:
                # only give prec/recall metrics for ref class
                metrics_list = [self.ref_class]

            examples_per_class = []
            for class_i in metrics_list:
                class_total = self._report_prec_recall_metrics(
                    confmat, class_i, m)
                examples_per_class.append(class_total)

            if len(examples_per_class) > 1:
                # get weighted f1
                f1 = 0
                total_exs = sum(examples_per_class)
                for i in range(len(self.class_list)):
                    f1 += (examples_per_class[i] / total_exs
                           ) * m['class_{}_f1'.format(self.class_list[i])]
                m['weighted_f1'] = f1

        for k, v in m.items():
            m[k] = round_sigfigs(v, 4)

        return m
Example #7
    def report(self):
        """
        Report the current metrics.

        :return:
            a metrics dict
        """
        m = {}
        if self.metrics['num_samples'] > 0:
            m['hits@1/100'] = round_sigfigs(
                self.metrics['hits@1/100'] / self.metrics['num_samples'], 4)
            m['loss'] = round_sigfigs(
                self.metrics['loss'] / self.metrics['num_samples'], 4)
            if 'med_rank' in self.metrics:
                m['med_rank'] = np.median(self.metrics['med_rank'])
        return m
Example #8
 def report(self):
     r = super().report()
     bsz = max(self.metrics['bsz'], 1)
     for k in ['know_loss', 'know_acc', 'know_chance']:
         # round and average across all items since last report
         r[k] = round_sigfigs(self.metrics[k] / bsz, 4)
     return r
Example #9
    def report(self):
        """
        Report loss and perplexity from model's perspective.

        Note that this includes predicting __END__ and __UNK__ tokens and may
        differ from a truly independent measurement.
        """
        base = super().report()
        m = {}
        num_tok = self.metrics['num_tokens']
        if num_tok > 0:
            m['loss'] = self.metrics['loss']
            if self.metrics['correct_tokens'] > 0:
                m['token_acc'] = self.metrics['correct_tokens'] / num_tok
            m['nll_loss'] = self.metrics['nll_loss'] / num_tok
            try:
                m['ppl'] = math.exp(m['nll_loss'])
            except OverflowError:
                m['ppl'] = float('inf')
        if self.metrics['total_skipped_batches'] > 0:
            m['total_skipped_batches'] = self.metrics['total_skipped_batches']
        for k, v in m.items():
            # clean up: rounds to sigfigs and converts tensors to floats
            base[k] = round_sigfigs(v, 4)
        return base
Example #10
    def report(self):
        """
        Return metrics calculated by the model.
        """
        # if we haven't initialized yet, just return a dummy object
        if not hasattr(self, "trainer"):
            return {}

        output = {k: v.avg for k, v in self.meters.items()}

        if "nll_loss" in self.meters:
            # special case, we used sentence averaging so ppl comes from nll_loss
            output["ppl"] = np.exp2(self.meters["nll_loss"].avg)
        else:
            # normal case, just use loss
            output["ppl"] = np.exp2(self.meters["loss"].avg)

        # Fairseq trainer metrics we'll pass up the way
        trainer_metrics = {"ups", "wps", "gnorm", "clip"}
        if self.is_training:
            for k in trainer_metrics:
                output[k] = self.trainer.meters[k].avg

        # for display purposes
        output = {k: round_sigfigs(v, 4) for k, v in output.items()}
        return output
Example #11
 def _format_interactive_output(self, probs, prediction_id):
     """Format interactive mode output with scores."""
     preds = []
     for i, pred_id in enumerate(prediction_id.tolist()):
         prob = round_sigfigs(probs[i][pred_id], 4)
         preds.append('Predicted class: {}\nwith probability: {}'.format(
             self.class_list[pred_id], prob))
     return preds
Example #12
 def report(self):
     """
     Report per-dialogue round metrics.
     """
     m = {}
     for k, v in self.metrics.items():
         if "num_samples" not in v:
             print(self.metrics)
             print(k)
             __import__("ipdb").set_trace()  # FIXME
         if v["num_samples"] > 0:
             m[f"{k}/hits@1/100"] = round_sigfigs(
                 v["hits@1/100"] / v["num_samples"], 4)
             m[f"{k}/loss"] = round_sigfigs(v["loss"] / v["num_samples"], 4)
             if "med_rank" in v:
                 m[f"{k}/med_rank"] = np.median(v["med_rank"])
     return m
Example #13
 def report(self):
     r = super().report()
     bsz = max(self.metrics['bsz'], 1)
     for k in [
             'intent_acc', 'flight_acc', 'name_acc', 'status_acc',
             'intent_loss', 'flight_loss', 'name_loss'
     ]:
         r[k] = round_sigfigs(self.metrics[k] / bsz, 4)
     return r
Example #14
def aggregate_task_reports(reports, tasks, micro=False):
    """
    Aggregate separate task reports into a single report.

    :param reports: list of report dicts from separate tasks
    :param tasks: list of tasks
    :param micro: average per example if True, else average over tasks

    :return: aggregated report dicts
    """
    if len(reports) == 1:
        # singular task
        return reports[0]
    # multiple tasks, aggregate metrics
    metrics = {}
    exs = {}
    total_report = {'tasks': {}}
    # collect metrics from all reports
    for i, report in enumerate(reports):
        total_report['tasks'][tasks[i]] = report
        for metric, val in report.items():
            if metric == 'exs':
                exs[tasks[i]] = val
            else:
                metrics.setdefault(metric, {})[tasks[i]] = val
    # now aggregate
    total_exs = sum(exs.values())
    total_report['exs'] = total_exs
    for metric, task_vals in metrics.items():
        if all([isinstance(v, Number) for v in task_vals.values()]):
            if micro:
                # average over the number of examples
                vals = [task_vals[task] * exs[task] for task in tasks]
                total_report[metric] = round_sigfigs(sum(vals) / total_exs, 4)
            else:  # macro
                # average over tasks
                vals = task_vals.values()
                total_report[metric] = round_sigfigs(sum(vals) / len(vals), 4)
    # add a warning describing how metrics were averaged across tasks.
    total_report['warning'] = 'metrics are averaged across tasks'
    if micro:
        total_report['warning'] += ' and weighted by the number of examples per task'
    return total_report
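To make the micro flag concrete, here is a small hypothetical call; the task names and all numbers are invented for illustration:

# Hypothetical reports; only 'exs' and 'accuracy' are shown.
reports = [{'exs': 10, 'accuracy': 0.5}, {'exs': 90, 'accuracy': 0.9}]
tasks = ['qa', 'chitchat']
macro = aggregate_task_reports(reports, tasks)              # accuracy = (0.5 + 0.9) / 2 = 0.7
micro = aggregate_task_reports(reports, tasks, micro=True)  # accuracy = (0.5*10 + 0.9*90) / 100 = 0.86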
Example #15
 def _nice_format(self, dictionary):
     rounded = {}
     for k, v in dictionary.items():
         if isinstance(v, dict):
             rounded[k] = self._nice_format(v)
         elif isinstance(v, float):
             rounded[k] = round_sigfigs(v, 4)
         else:
             rounded[k] = v
     return rounded
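A quick hypothetical call shows the recursive rounding (input values invented; non-float entries pass through unchanged):

# self._nice_format({'loss': 0.123456, 'nested': {'ppl': 12.3456}, 'exs': 10})
# -> {'loss': 0.1235, 'nested': {'ppl': 12.35}, 'exs': 10}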
Example #16
 def report(self):
     m = {}
     if self.metrics['num_tokens'] > 0:
         m['loss'] = self.metrics['loss'] / self.metrics['num_tokens']
         m['ppl'] = math.exp(m['loss'])
     if self.metrics['lm_num_tokens'] > 0:
         m['lmloss'] = self.metrics['lmloss'] / self.metrics['lm_num_tokens']
         m['lmppl'] = math.exp(m['lmloss'])
     for k, v in m.items():
         # clean up: rounds to sigfigs and converts tensors to floats
         m[k] = round_sigfigs(v, 4)
     return m
Example #17
 def report(self):
     """
     Report metrics from model's perspective.
     """
     m = TorchAgent.report(self)  # Skip TorchRankerAgent; totally redundant
     examples = self.metrics['examples']
     if examples > 0:
         m['examples'] = examples
         if 'dialog' in self.subtasks and self.metrics['dia_exs'] > 0:
             m['dia_loss'] = self.metrics['dia_loss'] / self.metrics[
                 'dia_exs']
             m['dia_rank'] = self.metrics['dia_rank'] / self.metrics[
                 'dia_exs']
             m['dia_acc'] = self.metrics['dia_correct'] / self.metrics[
                 'dia_exs']
             m['dia_exs'] = self.metrics['dia_exs']
         if 'feedback' in self.subtasks and self.metrics['fee_exs'] > 0:
             m['fee_loss'] = self.metrics['fee_loss'] / self.metrics[
                 'fee_exs']
             m['fee_rank'] = self.metrics['fee_rank'] / self.metrics[
                 'fee_exs']
             m['fee_acc'] = self.metrics['fee_correct'] / self.metrics[
                 'fee_exs']
             m['fee_exs'] = self.metrics['fee_exs']
         if 'satisfaction' in self.subtasks and self.metrics['sat_exs'] > 0:
             tp = self.metrics['sat_tp']
             tn = self.metrics['sat_tn']
             fp = self.metrics['sat_fp']
             fn = self.metrics['sat_fn']
             assert tp + tn + fp + fn == self.metrics['sat_exs']
             m['sat_loss'] = self.metrics['sat_loss'] / self.metrics[
                 'sat_exs']
             m['sat_pr'] = tp / (tp + fp + EPS)
             m['sat_re'] = tp / (tp + fn + EPS)
             pr = m['sat_pr']
             re = m['sat_re']
             m['sat_f1'] = (2 * pr * re) / (pr + re) if (pr and re) else 0.0
             m['sat_acc'] = (tp + tn) / self.metrics['sat_exs']
             m['sat_exs'] = self.metrics['sat_exs']
     for k, v in m.items():
         # clean up: rounds to sigfigs and converts tensors to floats
         if isinstance(v, float):
             m[k] = round_sigfigs(v, 4)
         else:
             m[k] = v
     return m
Example #18
def aggregate_metrics(reporters):
    """
    Aggregate metrics from multiple reports.
    """
    # reporters is a list of teachers or worlds
    m = {}
    m['tasks'] = {}
    sums = {}
    num_tasks = 0
    total = 0
    for i in range(len(reporters)):
        task_id = reporters[i].getID()
        task_report = reporters[i].report()
        for each_metric, value in task_report.items():
            if isinstance(value, float):
                sums[each_metric] = 0.0
                m[each_metric] = 0.0
            elif isinstance(value, Number):
                sums[each_metric] = 0
                m[each_metric] = 0

    for i in range(len(reporters)):
        task_id = reporters[i].getID()
        task_report = reporters[i].report()
        while task_id in m['tasks']:
            # prevent name clobbering if using multiple tasks with same ID
            task_id += '_'
        m['tasks'][task_id] = task_report
        total += task_report.get('exs', 0)
        found_any = False
        for k in sums.keys():
            if k in task_report:
                sums[k] += task_report[k]
                found_any = True
        if found_any:
            num_tasks += 1
    m['exs'] = total
    m['accuracy'] = 0
    if num_tasks > 0:
        for k in sums.keys():
            m[k] = round_sigfigs(sums[k] / num_tasks, 4)
    return m
Example #19
 def report(self):
     """Report loss and mean_rank from model's perspective."""
     base = super().report()
     m = {}
     examples = self.metrics['examples']
     if examples > 0:
         m['examples'] = examples
         m['loss'] = self.metrics['loss']
         m['mean_loss'] = self.metrics['loss'] / examples
         batch_train = self.candidates == 'batch' and self.is_training
         if not self.is_training or self.opt.get(
                 'train_predict') or batch_train:
             m['mean_rank'] = self.metrics['rank'] / examples
             m['mrr'] = self.metrics['mrr'] / examples
         if batch_train:
             m['train_accuracy'] = self.metrics['train_accuracy'] / examples
     for k, v in m.items():
         # clean up: rounds to sigfigs and converts tensors to floats
         base[k] = round_sigfigs(v, 4)
     return base
Example #20
    def report(self):
        """
        Report loss and perplexity from model's perspective.

        Note that this includes predicting __END__ and __UNK__ tokens and may
        differ from a truly independent measurement.
        """
        m = {}
        m["num_tokens"] = self.counts["num_tokens"]
        # m["num_batches"] = self.counts["num_batches"]
        # m["loss"] = self.metrics["loss"] / m["num_batches"]
        # m["base_loss"] = self.metrics["base_loss"] / m["num_batches"]
        m["acc"] = self.metrics["acc"] / m["num_tokens"]
        m["auc"] = self.metrics["auc"] / m["num_tokens"]
        # Top-k recommendation Recall
        for x in sorted(self.metrics):
            if x.startswith("recall") and self.counts[x] > 200:
                m[x] = self.metrics[x] / self.counts[x]
                m["num_tokens_" + x] = self.counts[x]
        for k, v in m.items():
            # clean up: rounds to sigfigs and converts tensors to floats
            m[k] = round_sigfigs(v, 4)
        return m
Example #21
    def report(self):
        base = super().report()
        m = dict()

        if self.metrics['loss_G_cnt'] > 0:
            m['loss_G'] = self.metrics['loss_G'] / self.metrics['loss_G_cnt']
        if self.metrics['loss_D_cnt'] > 0:
            m['loss_D'] = self.metrics['loss_D'] / self.metrics['loss_D_cnt']

        if self.metrics['kl_loss_cnt'] > 0:
            m['kl_loss'] = self.metrics['kl_loss'] / self.metrics['kl_loss_cnt']

        if self.metrics['bow_loss_cnt'] > 0:
            m['bow_loss'] = self.metrics['bow_loss'] / self.metrics[
                'bow_loss_cnt']

        if 'loss_G' in m and 'loss_D' in m:
            m['to_minimize'] = m['loss_G'] + m['loss_D']

        for k, v in m.items():
            # clean up: rounds to sigfigs and converts tensors to floats
            base[k] = round_sigfigs(v, 4)
        return base
Example #22
    def report(self):
        base = super().report()
        m = dict()

        if self.metrics['total_unigram_cnt'] > 0:
            m['dist_1_cnt'] = len(self.metrics['dist_unigram_tokens'])
            m['dist_1_ratio'] = m['dist_1_cnt'] / self.metrics[
                'total_unigram_cnt']

        if self.metrics['total_bigram_cnt'] > 0:
            m['dist_2_cnt'] = len(self.metrics['dist_bigram_tokens'])
            m['dist_2_ratio'] = m['dist_2_cnt'] / self.metrics[
                'total_bigram_cnt']

        if self.metrics['total_trigram_cnt'] > 0:
            m['dist_3_cnt'] = len(self.metrics['dist_trigram_tokens'])
            m['dist_3_ratio'] = m['dist_3_cnt'] / self.metrics[
                'total_trigram_cnt']

        if self.metrics['intra_unigram_cnt'] > 0:
            m['intra_dist_1'] = self.metrics['intra_unigram'] / self.metrics[
                'intra_unigram_cnt']

        if self.metrics['intra_bigram_cnt'] > 0:
            m['intra_dist_2'] = self.metrics['intra_bigram'] / self.metrics[
                'intra_bigram_cnt']

        if self.metrics['intra_trigram_cnt'] > 0:
            m['intra_dist_3'] = self.metrics['intra_trigram'] / self.metrics[
                'intra_trigram_cnt']

        if self.metrics['response_length_cnt'] > 0:
            m['response_length'] = self.metrics[
                'response_length'] / self.metrics['response_length_cnt']

        if self.metrics['embed_avg_cnt'] > 0:
            m['embed_avg'] = self.metrics['embed_avg'] / self.metrics[
                'embed_avg_cnt']
        if self.metrics['embed_extrema_cnt'] > 0:
            m['embed_extrema'] = self.metrics['embed_extrema'] / self.metrics[
                'embed_extrema_cnt']
        if self.metrics['embed_greedy_cnt'] > 0:
            m['embed_greedy'] = self.metrics['embed_greedy'] / self.metrics[
                'embed_greedy_cnt']
        if self.metrics['embed_coh_cnt'] > 0:
            m['embed_coh'] = self.metrics['embed_coh'] / self.metrics[
                'embed_coh_cnt']

        # Entropy
        if self.metrics['sent_entropy_uni_cnt'] > 0:
            m['sent_entropy_uni'] = self.metrics[
                'sent_entropy_uni'] / self.metrics['sent_entropy_uni_cnt']
        if self.metrics['sent_entropy_bi_cnt'] > 0:
            m['sent_entropy_bi'] = self.metrics[
                'sent_entropy_bi'] / self.metrics['sent_entropy_bi_cnt']
        if self.metrics['sent_entropy_tri_cnt'] > 0:
            m['sent_entropy_tri'] = self.metrics[
                'sent_entropy_tri'] / self.metrics['sent_entropy_tri_cnt']
        if self.metrics['word_entropy_uni_cnt'] > 0:
            m['word_entropy_uni'] = self.metrics[
                'word_entropy_uni'] / self.metrics['word_entropy_uni_cnt']
        if self.metrics['word_entropy_bi_cnt'] > 0:
            m['word_entropy_bi'] = self.metrics[
                'word_entropy_bi'] / self.metrics['word_entropy_bi_cnt']
        if self.metrics['word_entropy_tri_cnt'] > 0:
            m['word_entropy_tri'] = self.metrics[
                'word_entropy_tri'] / self.metrics['word_entropy_tri_cnt']

        # -- Ground-truth metrics
        if self.metrics['human_total_unigram_cnt'] > 0:
            m['human_dist_1_cnt'] = len(
                self.metrics['human_dist_unigram_tokens'])
            m['human_dist_1_ratio'] = m['human_dist_1_cnt'] / self.metrics[
                'human_total_unigram_cnt']

        if self.metrics['human_total_bigram_cnt'] > 0:
            m['human_dist_2_cnt'] = len(
                self.metrics['human_dist_bigram_tokens'])
            m['human_dist_2_ratio'] = m['human_dist_2_cnt'] / self.metrics[
                'human_total_bigram_cnt']

        if self.metrics['human_total_trigram_cnt'] > 0:
            m['human_dist_3_cnt'] = len(
                self.metrics['human_dist_trigram_tokens'])
            m['human_dist_3_ratio'] = m['human_dist_3_cnt'] / self.metrics[
                'human_total_trigram_cnt']

        if self.metrics['human_intra_unigram_cnt'] > 0:
            m['human_intra_dist_1'] = self.metrics[
                'human_intra_unigram'] / self.metrics['human_intra_unigram_cnt']

        if self.metrics['human_intra_bigram_cnt'] > 0:
            m['human_intra_dist_2'] = self.metrics[
                'human_intra_bigram'] / self.metrics['human_intra_bigram_cnt']

        if self.metrics['human_intra_trigram_cnt'] > 0:
            m['human_intra_dist_3'] = self.metrics[
                'human_intra_trigram'] / self.metrics['human_intra_trigram_cnt']

        if self.metrics['human_response_length_cnt'] > 0:
            m['human_response_length'] = self.metrics[
                'human_response_length'] / self.metrics[
                    'human_response_length_cnt']

        if self.metrics['human_embed_coh_cnt'] > 0:
            m['human_embed_coh'] = self.metrics[
                'human_embed_coh'] / self.metrics['human_embed_coh_cnt']

        if self.metrics['human_sent_entropy_uni_cnt'] > 0:
            m['human_sent_entropy_uni'] = self.metrics[
                'human_sent_entropy_uni'] / self.metrics[
                    'human_sent_entropy_uni_cnt']
        if self.metrics['human_sent_entropy_bi_cnt'] > 0:
            m['human_sent_entropy_bi'] = self.metrics[
                'human_sent_entropy_bi'] / self.metrics[
                    'human_sent_entropy_bi_cnt']
        if self.metrics['human_sent_entropy_tri_cnt'] > 0:
            m['human_sent_entropy_tri'] = self.metrics[
                'human_sent_entropy_tri'] / self.metrics[
                    'human_sent_entropy_tri_cnt']
        if self.metrics['human_word_entropy_uni_cnt'] > 0:
            m['human_word_entropy_uni'] = self.metrics[
                'human_word_entropy_uni'] / self.metrics[
                    'human_word_entropy_uni_cnt']
        if self.metrics['human_word_entropy_bi_cnt'] > 0:
            m['human_word_entropy_bi'] = self.metrics[
                'human_word_entropy_bi'] / self.metrics[
                    'human_word_entropy_bi_cnt']
        if self.metrics['human_word_entropy_tri_cnt'] > 0:
            m['human_word_entropy_tri'] = self.metrics[
                'human_word_entropy_tri'] / self.metrics[
                    'human_word_entropy_tri_cnt']

        if not self.model.training:
            # TODO: add other metrics and balance these metrics
            m['total_metric'] = \
                (-base.get('ppl' if 'ppl' not in self.metrics_exclude_from_total_metric else 'NON_EXIST', 0) * 0.25) / 100 + \
                (m.get('dist_1_ratio', 0) + m.get('dist_2_ratio', 0) + clip_value(m.get('dist_3_ratio', 0), 0.001)) + \
                (m.get('embed_avg', 0) + m.get('embed_greedy', 0) + m.get('embed_extrema', 0) + m.get('embed_coh', 0)) + \
                (m.get('intra_dist_1', 0) + m.get('intra_dist_2', 0) + m.get('intra_dist_3', 0)) / 10 + \
                (m.get('word_entropy_uni', 0) + m.get('word_entropy_bi', 0) + m.get('word_entropy_tri', 0)) / 50 + \
                (m.get('response_length' if 'response_length' not in self.metrics_exclude_from_total_metric else 'NON_EXIST',
                       0)) / self.max_response_len

            # sent_entropy is strong correlated with word_entropy, so we only compute one of them
            # m.get('sent_entropy_uni', 0) + m.get('sent_entropy_bi', 0) + m.get('sent_entropy_tri', 0)

        for k, v in m.items():
            # clean up: rounds to sigfigs and converts tensors to floats
            base[k] = round_sigfigs(v, 5)
        return base
Example #23
def eval_ppl(opt, build_dict=None, dict_file=None):
    """
    Evaluates the perplexity of a model.

    This uses a dictionary which implements the following functions:
    - tokenize(text): splits string up into list of tokens
    - __in__(text): checks whether dictionary contains a token
    - keys(): returns an iterator over all tokens in the dictionary

    :param opt: option dict
    :param build_dict: function which returns a dictionary class implementing
        the functions above.
    :param dict_file: file used when loading the dictionary class set via the
        "dictionary_class" argument (defaults to
        parlai.core.dict:DictionaryAgent).

    Either build_dict or dict_file must be set (both default to None) to
    determine the dictionary used for the evaluation.
    """
    if not build_dict and not dict_file:
        raise RuntimeError('eval_ppl script either needs a dictionary build '
                           'function or a dictionary file.')

    if build_dict:
        dict_agent = build_dict()
    else:
        dict_opt = copy.deepcopy(opt)
        dict_opt['model'] = dict_opt.get('dictionary_class',
                                         'parlai.core.dict:DictionaryAgent')
        dict_opt['model_file'] = dict_file
        if 'override' in dict_opt:
            del dict_opt['override']
        dict_agent = create_agent(dict_opt, requireModelExists=True)

    # create agents
    agent = create_agent(opt)
    world = create_task(opt, [agent, dict_agent],
                        default_world=PerplexityWorld)

    # set up logging
    log_time = Timer()
    tot_time = 0

    while not world.epoch_done():
        world.parley()  # process an example

        if log_time.time() > 1:  # log every 1 sec
            tot_time += log_time.time()
            report = world.report()
            print('{}s elapsed, {}% complete, {}'.format(
                int(tot_time),
                round_sigfigs(report['exs'] / world.num_examples() * 100, 3),
                report,
            ))
            log_time.reset()
    print('EPOCH DONE')
    tot_time += log_time.time()
    final_report = world.report()
    print('{}s elapsed: {}'.format(int(tot_time), final_report))
    print("============================")
    print("FINAL PPL: " + str(final_report['ppl']))
    if final_report.get('ppl', 0) == float('inf'):
        print('Note: you got inf perplexity. Consider adding (or raising) the '
              'minimum probability you assign to each possible word. If you '
              'assign zero probability to the correct token in the evaluation '
              'vocabulary, you get inf probability immediately.')
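As a sketch of the dictionary interface described in the docstring above, a minimal stand-in passed via build_dict might look like this; the class name and word list are invented, and in Python the docstring's __in__ check corresponds to __contains__:

class TinyDict:
    """Hypothetical minimal dictionary satisfying the interface eval_ppl expects."""

    def __init__(self, words):
        self._words = set(words)

    def tokenize(self, text):
        # naive whitespace tokenization
        return text.split()

    def __contains__(self, token):
        # backs the `token in dictionary` membership check
        return token in self._words

    def keys(self):
        return iter(self._words)

# eval_ppl(opt, build_dict=lambda: TinyDict(['hello', 'world']))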
Example #24
 def clip(f):
     return round_sigfigs(f)
Example #25
 def _get_bins(self, counts: Counter):
     c = Counter()
     for k, v in counts.items():
         c.update({self.truebins.get(k, 'never'): v})
     t = sum(c.values())
     return {k: round_sigfigs(v / t, 4) for k, v in c.items()}
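A hypothetical call (the truebins mapping and counts are invented) illustrates the normalization:

# Suppose self.truebins == {'apple': 'train', 'pear': 'valid'}.
# self._get_bins(Counter({'apple': 3, 'pear': 1, 'kiwi': 1}))
# -> {'train': 0.6, 'valid': 0.2, 'never': 0.2}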