def report(self): """Report the metrics over all data seen so far.""" m = {} total = self.metrics['cnt'] m['exs'] = total if total > 0: if self.flags['print_prediction_metrics']: if 'accuracy' in self.metrics_list: m['accuracy'] = round_sigfigs( self.metrics['correct'] / max(1, self.metrics['correct_cnt']), 4) if 'f1' in self.metrics_list: m['f1'] = round_sigfigs( self.metrics['f1'] / max(1, self.metrics['f1_cnt']), 4) if self.flags['has_text_cands']: for k in self.eval_pr: m['hits@' + str(k)] = round_sigfigs( self.metrics['hits@' + str(k)] / max(1, self.metrics['hits@_cnt']), 3, ) for k in self.metrics_list: if self.metrics[k + '_cnt'] > 0 and k != 'correct' and k != 'f1': m[k] = round_sigfigs( self.metrics[k] / max(1, self.metrics[k + '_cnt']), 4) return m
def report(self): """Report per-dialogue round metrics.""" m = {k: {} for k in ["first_round", "second_round", "third_round+"]} for k, v in self.metrics.items(): if v["num_samples"] > 0: m[k]["hits@1/100"] = round_sigfigs( v["hits@1/100"] / v["num_samples"], 4) m[k]["loss"] = round_sigfigs(v["loss"] / v["num_samples"], 4) if "med_rank" in v: m[k]["med_rank"] = np.median(v["med_rank"]) return m
def report(self):
    m = {}
    with self._lock():
        m['exs'] = self.metrics['exs']
        if m['exs'] > 0:
            # m['num_unk'] = self.metrics['num_unk']
            # m['num_tokens'] = self.metrics['num_tokens']
            m['loss'] = round_sigfigs(
                self.metrics['loss'] / self.metrics['num_tokens'], 3)
            m['ppl'] = round_sigfigs(
                math.exp(self.metrics['loss'] / self.metrics['num_tokens']), 4)
    return m
def test_round_sigfigs(self):
    x = 0
    y = 0
    assert round_sigfigs(x, 2) == y

    x = 100
    y = 100
    assert round_sigfigs(x, 2) == y

    x = 0.01
    y = 0.01
    assert round_sigfigs(x, 2) == y

    x = 0.00123
    y = 0.001
    assert round_sigfigs(x, 1) == y

    x = 0.37
    y = 0.4
    assert round_sigfigs(x, 1) == y

    x = 2353
    y = 2350
    assert round_sigfigs(x, 3) == y

    x = 3547345734
    y = 3547350000
    assert round_sigfigs(x, 6) == y

    x = 0.0000046246
    y = 0.00000462
    assert round_sigfigs(x, 3) == y
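# The test above exercises round_sigfigs without showing its definition.
# A minimal sketch consistent with the asserted behavior (an assumption,
# not the actual library implementation) could look like this:
import math

def round_sigfigs(x, sigfigs=4):
    """Round x to the given number of significant figures."""
    if x == 0:
        return 0
    try:
        # shift the rounding position so `sigfigs` significant digits survive
        return round(x, -int(math.floor(math.log10(abs(x)))) + (sigfigs - 1))
    except (TypeError, ValueError):
        # e.g. a zero-dim tensor: fall back to its plain float value
        return round_sigfigs(float(x), sigfigs)

# e.g. round_sigfigs(2353, 3) == 2350 and round_sigfigs(0.37, 1) == 0.4,
# matching the assertions above.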
def report(self): """ Report loss and perplexity from model's perspective. Note that this includes predicting __END__ and __UNK__ tokens and may differ from a truly independent measurement. Additionally report tokenized bleu scores, if desired. """ base = super().report() m = {} num_tok = self.metrics['num_tokens'] if num_tok > 0: m['loss'] = self.metrics['loss'] if self.metrics['correct_tokens'] > 0: m['token_acc'] = self.metrics['correct_tokens'] / num_tok m['nll_loss'] = self.metrics['nll_loss'] / num_tok try: m['ppl'] = math.exp(m['nll_loss']) except OverflowError: m['ppl'] = float('inf') if self.metrics['total_skipped_batches'] > 0: m['total_skipped_batches'] = self.metrics['total_skipped_batches'] for k, v in m.items(): # clean up: rounds to sigfigs and converts tensors to floats base[k] = round_sigfigs(v, 4) if not self.skip_generation and self.compute_tokenized_bleu: base.update({'fairseq_bleu': 'N/A', 'nltk_bleu_unnormalized': 'N/A'}) if fairseq_bleu is not None: try: fairseq_bleu_scores = { k: self.fairseq_bleu_scorer.result_string(order=k) for k in range(1, 5) } except ZeroDivisionError: # some preds are REAL bad fairseq_bleu_scores = {k: '= 0,' for k in range(1, 5)} base['fairseq_bleu'] = { k: float(v[v.index('= ') + 2 : v.index(',')]) for k, v in fairseq_bleu_scores.items() } if nltkbleu is not None: base['nltk_bleu_unnormalized'] = { k: round_sigfigs(v['score'] / v['cnt'], 4) for k, v in self.nltk_bleu_scores.items() } return base
def report(self): """Report loss as well as precision, recall, and F1 metrics.""" m = super().report() examples = self.metrics['examples'] if examples > 0: m['examples'] = examples m['mean_loss'] = self.metrics['loss'] / examples # get prec/recall metrics confmat = self.metrics['confusion_matrix'] if self.opt.get('get_all_metrics'): metrics_list = self.class_list else: # only give prec/recall metrics for ref class metrics_list = [self.ref_class] examples_per_class = [] for class_i in metrics_list: class_total = self._report_prec_recall_metrics( confmat, class_i, m) examples_per_class.append(class_total) if len(examples_per_class) > 1: # get weighted f1 f1 = 0 total_exs = sum(examples_per_class) for i in range(len(self.class_list)): f1 += (examples_per_class[i] / total_exs ) * m['class_{}_f1'.format(self.class_list[i])] m['weighted_f1'] = f1 for k, v in m.items(): m[k] = round_sigfigs(v, 4) return m
def report(self): """ Report the current metrics. :return: a metrics dict """ m = {} if self.metrics['num_samples'] > 0: m['hits@1/100'] = round_sigfigs( self.metrics['hits@1/100'] / self.metrics['num_samples'], 4) m['loss'] = round_sigfigs( self.metrics['loss'] / self.metrics['num_samples'], 4) if 'med_rank' in self.metrics: m['med_rank'] = np.median(self.metrics['med_rank']) return m
def report(self):
    r = super().report()
    bsz = max(self.metrics['bsz'], 1)
    for k in ['know_loss', 'know_acc', 'know_chance']:
        # round and average across all items since last report
        r[k] = round_sigfigs(self.metrics[k] / bsz, 4)
    return r
def report(self): """ Report loss and perplexity from model's perspective. Note that this includes predicting __END__ and __UNK__ tokens and may differ from a truly independent measurement. """ base = super().report() m = {} num_tok = self.metrics['num_tokens'] if num_tok > 0: m['loss'] = self.metrics['loss'] if self.metrics['correct_tokens'] > 0: m['token_acc'] = self.metrics['correct_tokens'] / num_tok m['nll_loss'] = self.metrics['nll_loss'] / num_tok try: m['ppl'] = math.exp(m['nll_loss']) except OverflowError: m['ppl'] = float('inf') if self.metrics['total_skipped_batches'] > 0: m['total_skipped_batches'] = self.metrics['total_skipped_batches'] for k, v in m.items(): # clean up: rounds to sigfigs and converts tensors to floats base[k] = round_sigfigs(v, 4) return base
def report(self): """ Return metrics calculated by the model. """ # if we haven't initialized yet, just return a dummy object if not hasattr(self, "trainer"): return {} output = {k: v.avg for k, v in self.meters.items()} if "nll_loss" in self.meters: # special case, we used sentence averaging so ppl comes from nll_loss output["ppl"] = np.exp2(self.meters["nll_loss"].avg) else: # normal case, just use loss output["ppl"] = np.exp2(self.meters["loss"].avg) # Fairseq trainer metrics we'll pass up the way trainer_metrics = {"ups", "wps", "gnorm", "clip"} if self.is_training: for k in trainer_metrics: output[k] = self.trainer.meters[k].avg # for display purposes output = {k: round_sigfigs(v, 4) for k, v in output.items()} return output
def _format_interactive_output(self, probs, prediction_id):
    """Format interactive mode output with scores."""
    preds = []
    for i, pred_id in enumerate(prediction_id.tolist()):
        prob = round_sigfigs(probs[i][pred_id], 4)
        preds.append(
            'Predicted class: {}\nwith probability: {}'.format(
                self.class_list[pred_id], prob))
    return preds
def report(self): """ Report per-dialogue round metrics. """ m = {} for k, v in self.metrics.items(): if "num_samples" not in v: print(self.metrics) print(k) __import__("ipdb").set_trace() # FIXME if v["num_samples"] > 0: m[f"{k}/hits@1/100"] = round_sigfigs( v["hits@1/100"] / v["num_samples"], 4) m[f"{k}/loss"] = round_sigfigs(v["loss"] / v["num_samples"], 4) if "med_rank" in v: m[f"{k}/med_rank"] = np.median(v["med_rank"]) return m
def report(self):
    r = super().report()
    bsz = max(self.metrics['bsz'], 1)
    for k in [
        'intent_acc', 'flight_acc', 'name_acc', 'status_acc',
        'intent_loss', 'flight_loss', 'name_loss',
    ]:
        r[k] = round_sigfigs(self.metrics[k] / bsz, 4)
    return r
def aggregate_task_reports(reports, tasks, micro=False):
    """
    Aggregate separate task reports into a single report.

    :param reports: list of report dicts from separate tasks
    :param tasks: list of tasks
    :param micro: average per example if True, else average over tasks

    :return: aggregated report dicts
    """
    if len(reports) == 1:
        # singular task
        return reports[0]
    # multiple tasks, aggregate metrics
    metrics = {}
    exs = {}
    total_report = {'tasks': {}}
    # collect metrics from all reports
    for i, report in enumerate(reports):
        total_report['tasks'][tasks[i]] = report
        for metric, val in report.items():
            if metric == 'exs':
                exs[tasks[i]] = val
            else:
                metrics.setdefault(metric, {})[tasks[i]] = val
    # now aggregate
    total_exs = sum(exs.values())
    total_report['exs'] = total_exs
    for metric, task_vals in metrics.items():
        if all(isinstance(v, Number) for v in task_vals.values()):
            if micro:
                # micro: average over the number of examples
                vals = [task_vals[task] * exs[task] for task in tasks]
                total_report[metric] = round_sigfigs(sum(vals) / total_exs, 4)
            else:
                # macro: average over tasks
                vals = task_vals.values()
                total_report[metric] = round_sigfigs(sum(vals) / len(vals), 4)
    # add a warning describing how metrics were averaged across tasks
    total_report['warning'] = 'metrics are averaged across tasks'
    if micro:
        total_report['warning'] += ' and weighted by the number of examples per task'
    return total_report
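# A hypothetical usage sketch for the aggregation above (report values
# invented for illustration): with micro=False every task contributes
# equally; with micro=True every example does, so the larger task dominates.
reports = [
    {'exs': 100, 'accuracy': 0.50},
    {'exs': 300, 'accuracy': 0.90},
]
tasks = ['taskA', 'taskB']
macro = aggregate_task_reports(reports, tasks, micro=False)  # accuracy = 0.7
micro = aggregate_task_reports(reports, tasks, micro=True)   # accuracy = 0.8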
def _nice_format(self, dictionary):
    rounded = {}
    for k, v in dictionary.items():
        if isinstance(v, dict):
            rounded[k] = self._nice_format(v)
        elif isinstance(v, float):
            rounded[k] = round_sigfigs(v, 4)
        else:
            rounded[k] = v
    return rounded
def report(self):
    m = {}
    if self.metrics['num_tokens'] > 0:
        m['loss'] = self.metrics['loss'] / self.metrics['num_tokens']
        m['ppl'] = math.exp(m['loss'])
    if self.metrics['lm_num_tokens'] > 0:
        m['lmloss'] = self.metrics['lmloss'] / self.metrics['lm_num_tokens']
        m['lmppl'] = math.exp(m['lmloss'])
    for k, v in m.items():
        # clean up: rounds to sigfigs and converts tensors to floats
        m[k] = round_sigfigs(v, 4)
    return m
def report(self): """ Report metrics from model's perspective. """ m = TorchAgent.report(self) # Skip TorchRankerAgent; totally redundant examples = self.metrics['examples'] if examples > 0: m['examples'] = examples if 'dialog' in self.subtasks and self.metrics['dia_exs'] > 0: m['dia_loss'] = self.metrics['dia_loss'] / self.metrics[ 'dia_exs'] m['dia_rank'] = self.metrics['dia_rank'] / self.metrics[ 'dia_exs'] m['dia_acc'] = self.metrics['dia_correct'] / self.metrics[ 'dia_exs'] m['dia_exs'] = self.metrics['dia_exs'] if 'feedback' in self.subtasks and self.metrics['fee_exs'] > 0: m['fee_loss'] = self.metrics['fee_loss'] / self.metrics[ 'fee_exs'] m['fee_rank'] = self.metrics['fee_rank'] / self.metrics[ 'fee_exs'] m['fee_acc'] = self.metrics['fee_correct'] / self.metrics[ 'fee_exs'] m['fee_exs'] = self.metrics['fee_exs'] m['fee_exs'] = self.metrics['fee_exs'] if 'satisfaction' in self.subtasks and self.metrics['sat_exs'] > 0: tp = self.metrics['sat_tp'] tn = self.metrics['sat_tn'] fp = self.metrics['sat_fp'] fn = self.metrics['sat_fn'] assert tp + tn + fp + fn == self.metrics['sat_exs'] m['sat_loss'] = self.metrics['sat_loss'] / self.metrics[ 'sat_exs'] m['sat_pr'] = tp / (tp + fp + EPS) m['sat_re'] = tp / (tp + fn + EPS) pr = m['sat_pr'] re = m['sat_re'] m['sat_f1'] = (2 * pr * re) / (pr + re) if (pr and re) else 0.0 m['sat_acc'] = (tp + tn) / self.metrics['sat_exs'] m['sat_exs'] = self.metrics['sat_exs'] for k, v in m.items(): # clean up: rounds to sigfigs and converts tensors to floats if isinstance(v, float): m[k] = round_sigfigs(v, 4) else: m[k] = v return m
def aggregate_metrics(reporters):
    """
    Aggregate metrics from multiple reports.
    """
    # reporters is a list of teachers or worlds
    m = {}
    m['tasks'] = {}
    sums = {}
    num_tasks = 0
    total = 0
    # first pass: discover which numeric metrics appear in any report
    for i in range(len(reporters)):
        task_id = reporters[i].getID()
        task_report = reporters[i].report()
        for each_metric, value in task_report.items():
            if isinstance(value, float):
                sums[each_metric] = 0.0
                m[each_metric] = 0.0
            elif isinstance(value, Number):
                sums[each_metric] = 0
                m[each_metric] = 0
    # second pass: accumulate per-task reports and totals
    for i in range(len(reporters)):
        task_id = reporters[i].getID()
        task_report = reporters[i].report()
        while task_id in m['tasks']:
            # prevent name clobbering if using multiple tasks with same ID
            task_id += '_'
        m['tasks'][task_id] = task_report
        total += task_report.get('exs', 0)
        found_any = False
        for k in sums.keys():
            if k in task_report:
                sums[k] += task_report[k]
                found_any = True
        if found_any:
            num_tasks += 1
    m['exs'] = total
    m['accuracy'] = 0
    if num_tasks > 0:
        for k in sums.keys():
            m[k] = round_sigfigs(sums[k] / num_tasks, 4)
    return m
def report(self): """Report loss and mean_rank from model's perspective.""" base = super().report() m = {} examples = self.metrics['examples'] if examples > 0: m['examples'] = examples m['loss'] = self.metrics['loss'] m['mean_loss'] = self.metrics['loss'] / examples batch_train = self.candidates == 'batch' and self.is_training if not self.is_training or self.opt.get( 'train_predict') or batch_train: m['mean_rank'] = self.metrics['rank'] / examples m['mrr'] = self.metrics['mrr'] / examples if batch_train: m['train_accuracy'] = self.metrics['train_accuracy'] / examples for k, v in m.items(): # clean up: rounds to sigfigs and converts tensors to floats base[k] = round_sigfigs(v, 4) return base
def report(self): """ Report loss and perplexity from model's perspective. Note that this includes predicting __END__ and __UNK__ tokens and may differ from a truly independent measurement. """ m = {} m["num_tokens"] = self.counts["num_tokens"] # m["num_batches"] = self.counts["num_batches"] # m["loss"] = self.metrics["loss"] / m["num_batches"] # m["base_loss"] = self.metrics["base_loss"] / m["num_batches"] m["acc"] = self.metrics["acc"] / m["num_tokens"] m["auc"] = self.metrics["auc"] / m["num_tokens"] # Top-k recommendation Recall for x in sorted(self.metrics): if x.startswith("recall") and self.counts[x] > 200: m[x] = self.metrics[x] / self.counts[x] m["num_tokens_" + x] = self.counts[x] for k, v in m.items(): # clean up: rounds to sigfigs and converts tensors to floats m[k] = round_sigfigs(v, 4) return m
def report(self):
    base = super().report()
    m = dict()
    if self.metrics['loss_G_cnt'] > 0:
        m['loss_G'] = self.metrics['loss_G'] / self.metrics['loss_G_cnt']
    if self.metrics['loss_D_cnt'] > 0:
        m['loss_D'] = self.metrics['loss_D'] / self.metrics['loss_D_cnt']
    if self.metrics['kl_loss_cnt'] > 0:
        m['kl_loss'] = self.metrics['kl_loss'] / self.metrics['kl_loss_cnt']
    if self.metrics['bow_loss_cnt'] > 0:
        m['bow_loss'] = self.metrics['bow_loss'] / self.metrics['bow_loss_cnt']
    if 'loss_G' in m and 'loss_D' in m:
        m['to_minimize'] = m['loss_G'] + m['loss_D']
    for k, v in m.items():
        # clean up: rounds to sigfigs and converts tensors to floats
        base[k] = round_sigfigs(v, 4)
    return base
def report(self):
    base = super().report()
    m = dict()
    if self.metrics['total_unigram_cnt'] > 0:
        m['dist_1_cnt'] = len(self.metrics['dist_unigram_tokens'])
        m['dist_1_ratio'] = m['dist_1_cnt'] / self.metrics['total_unigram_cnt']
    if self.metrics['total_bigram_cnt'] > 0:
        m['dist_2_cnt'] = len(self.metrics['dist_bigram_tokens'])
        m['dist_2_ratio'] = m['dist_2_cnt'] / self.metrics['total_bigram_cnt']
    if self.metrics['total_trigram_cnt'] > 0:
        m['dist_3_cnt'] = len(self.metrics['dist_trigram_tokens'])
        m['dist_3_ratio'] = m['dist_3_cnt'] / self.metrics['total_trigram_cnt']
    if self.metrics['intra_unigram_cnt'] > 0:
        m['intra_dist_1'] = self.metrics['intra_unigram'] / self.metrics['intra_unigram_cnt']
    if self.metrics['intra_bigram_cnt'] > 0:
        m['intra_dist_2'] = self.metrics['intra_bigram'] / self.metrics['intra_bigram_cnt']
    if self.metrics['intra_trigram_cnt'] > 0:
        m['intra_dist_3'] = self.metrics['intra_trigram'] / self.metrics['intra_trigram_cnt']
    if self.metrics['response_length_cnt'] > 0:
        m['response_length'] = self.metrics['response_length'] / self.metrics['response_length_cnt']
    if self.metrics['embed_avg_cnt'] > 0:
        m['embed_avg'] = self.metrics['embed_avg'] / self.metrics['embed_avg_cnt']
    if self.metrics['embed_extrema_cnt'] > 0:
        m['embed_extrema'] = self.metrics['embed_extrema'] / self.metrics['embed_extrema_cnt']
    if self.metrics['embed_greedy_cnt'] > 0:
        m['embed_greedy'] = self.metrics['embed_greedy'] / self.metrics['embed_greedy_cnt']
    if self.metrics['embed_coh_cnt'] > 0:
        m['embed_coh'] = self.metrics['embed_coh'] / self.metrics['embed_coh_cnt']
    # Entropy
    if self.metrics['sent_entropy_uni_cnt'] > 0:
        m['sent_entropy_uni'] = self.metrics['sent_entropy_uni'] / self.metrics['sent_entropy_uni_cnt']
    if self.metrics['sent_entropy_bi_cnt'] > 0:
        m['sent_entropy_bi'] = self.metrics['sent_entropy_bi'] / self.metrics['sent_entropy_bi_cnt']
    if self.metrics['sent_entropy_tri_cnt'] > 0:
        m['sent_entropy_tri'] = self.metrics['sent_entropy_tri'] / self.metrics['sent_entropy_tri_cnt']
    if self.metrics['word_entropy_uni_cnt'] > 0:
        m['word_entropy_uni'] = self.metrics['word_entropy_uni'] / self.metrics['word_entropy_uni_cnt']
    if self.metrics['word_entropy_bi_cnt'] > 0:
        m['word_entropy_bi'] = self.metrics['word_entropy_bi'] / self.metrics['word_entropy_bi_cnt']
    if self.metrics['word_entropy_tri_cnt'] > 0:
        m['word_entropy_tri'] = self.metrics['word_entropy_tri'] / self.metrics['word_entropy_tri_cnt']
    # -- Ground-truth metrics
    if self.metrics['human_total_unigram_cnt'] > 0:
        m['human_dist_1_cnt'] = len(self.metrics['human_dist_unigram_tokens'])
        m['human_dist_1_ratio'] = m['human_dist_1_cnt'] / self.metrics['human_total_unigram_cnt']
    if self.metrics['human_total_bigram_cnt'] > 0:
        m['human_dist_2_cnt'] = len(self.metrics['human_dist_bigram_tokens'])
        m['human_dist_2_ratio'] = m['human_dist_2_cnt'] / self.metrics['human_total_bigram_cnt']
    if self.metrics['human_total_trigram_cnt'] > 0:
        m['human_dist_3_cnt'] = len(self.metrics['human_dist_trigram_tokens'])
        m['human_dist_3_ratio'] = m['human_dist_3_cnt'] / self.metrics['human_total_trigram_cnt']
    if self.metrics['human_intra_unigram_cnt'] > 0:
        m['human_intra_dist_1'] = self.metrics['human_intra_unigram'] / self.metrics['human_intra_unigram_cnt']
    if self.metrics['human_intra_bigram_cnt'] > 0:
        m['human_intra_dist_2'] = self.metrics['human_intra_bigram'] / self.metrics['human_intra_bigram_cnt']
    if self.metrics['human_intra_trigram_cnt'] > 0:
        m['human_intra_dist_3'] = self.metrics['human_intra_trigram'] / self.metrics['human_intra_trigram_cnt']
    if self.metrics['human_response_length_cnt'] > 0:
        m['human_response_length'] = self.metrics['human_response_length'] / self.metrics['human_response_length_cnt']
    if self.metrics['human_embed_coh_cnt'] > 0:
        m['human_embed_coh'] = self.metrics['human_embed_coh'] / self.metrics['human_embed_coh_cnt']
    if self.metrics['human_sent_entropy_uni_cnt'] > 0:
        m['human_sent_entropy_uni'] = self.metrics['human_sent_entropy_uni'] / self.metrics['human_sent_entropy_uni_cnt']
    if self.metrics['human_sent_entropy_bi_cnt'] > 0:
        m['human_sent_entropy_bi'] = self.metrics['human_sent_entropy_bi'] / self.metrics['human_sent_entropy_bi_cnt']
    if self.metrics['human_sent_entropy_tri_cnt'] > 0:
        m['human_sent_entropy_tri'] = self.metrics['human_sent_entropy_tri'] / self.metrics['human_sent_entropy_tri_cnt']
    if self.metrics['human_word_entropy_uni_cnt'] > 0:
        m['human_word_entropy_uni'] = self.metrics['human_word_entropy_uni'] / self.metrics['human_word_entropy_uni_cnt']
    if self.metrics['human_word_entropy_bi_cnt'] > 0:
        m['human_word_entropy_bi'] = self.metrics['human_word_entropy_bi'] / self.metrics['human_word_entropy_bi_cnt']
    if self.metrics['human_word_entropy_tri_cnt'] > 0:
        m['human_word_entropy_tri'] = self.metrics['human_word_entropy_tri'] / self.metrics['human_word_entropy_tri_cnt']
    if not self.model.training:
        # TODO: add other metrics and balance these metrics
        m['total_metric'] = (
            (-base.get('ppl' if 'ppl' not in self.metrics_exclude_from_total_metric else 'NON_EXIST', 0) * 0.25) / 100
            + (m.get('dist_1_ratio', 0) + m.get('dist_2_ratio', 0) + clip_value(m.get('dist_3_ratio', 0), 0.001))
            + (m.get('embed_avg', 0) + m.get('embed_greedy', 0) + m.get('embed_extrema', 0) + m.get('embed_coh', 0))
            + (m.get('intra_dist_1', 0) + m.get('intra_dist_2', 0) + m.get('intra_dist_3', 0)) / 10
            + (m.get('word_entropy_uni', 0) + m.get('word_entropy_bi', 0) + m.get('word_entropy_tri', 0)) / 50
            + m.get('response_length' if 'response_length' not in self.metrics_exclude_from_total_metric else 'NON_EXIST', 0) / self.max_response_len
        )
        # sent_entropy is strongly correlated with word_entropy, so we only compute one of them:
        # m.get('sent_entropy_uni', 0) + m.get('sent_entropy_bi', 0) + m.get('sent_entropy_tri', 0)
    for k, v in m.items():
        # clean up: rounds to sigfigs and converts tensors to floats
        base[k] = round_sigfigs(v, 5)
    return base
def eval_ppl(opt, build_dict=None, dict_file=None):
    """
    Evaluates the perplexity of a model.

    This uses a dictionary which implements the following functions:
    - tokenize(text): splits string up into list of tokens
    - __in__(text): checks whether dictionary contains a token
    - keys(): returns an iterator over all tokens in the dictionary

    :param opt: option dict
    :param build_dict: function which returns a dictionary class implementing
        the functions above.
    :param dict_file: file used when loading the dictionary class set via the
        "dictionary_class" argument (defaults to
        parlai.core.dict:DictionaryAgent).

    Either build_dict or dict_file must be set (both default to None) to
    determine the dictionary used for the evaluation.
    """
    if not build_dict and not dict_file:
        raise RuntimeError('eval_ppl script either needs a dictionary build '
                           'function or a dictionary file.')

    if build_dict:
        dict_agent = build_dict()
    else:
        dict_opt = copy.deepcopy(opt)
        dict_opt['model'] = dict_opt.get('dictionary_class',
                                         'parlai.core.dict:DictionaryAgent')
        dict_opt['model_file'] = dict_file
        if 'override' in dict_opt:
            del dict_opt['override']
        dict_agent = create_agent(dict_opt, requireModelExists=True)

    # create agents
    agent = create_agent(opt)
    world = create_task(opt, [agent, dict_agent], default_world=PerplexityWorld)

    # set up logging
    log_time = Timer()
    tot_time = 0

    while not world.epoch_done():
        world.parley()  # process an example
        if log_time.time() > 1:  # log every 1 sec
            tot_time += log_time.time()
            report = world.report()
            print('{}s elapsed, {}% complete, {}'.format(
                int(tot_time),
                round_sigfigs(report['exs'] / world.num_examples() * 100, 3),
                report,
            ))
            log_time.reset()
    print('EPOCH DONE')
    tot_time += log_time.time()
    final_report = world.report()
    print('{}s elapsed: {}'.format(int(tot_time), final_report))
    print("============================")
    print("FINAL PPL: " + str(final_report['ppl']))
    if final_report.get('ppl', 0) == float('inf'):
        print('Note: you got inf perplexity. Consider adding (or raising) the '
              'minimum probability you assign to each possible word. If you '
              'assign zero probability to the correct token in the evaluation '
              'vocabulary, you get inf perplexity immediately.')
def clip(f):
    return round_sigfigs(f)
def _get_bins(self, counts: Counter):
    c = Counter()
    for k, v in counts.items():
        c.update({self.truebins.get(k, 'never'): v})
    t = sum(c.values())
    return {k: round_sigfigs(v / t, 4) for k, v in c.items()}
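# A hypothetical illustration of the binning logic above (names and values
# invented): raw keys are remapped into bins via a lookup table, unmapped
# keys fall into 'never', and the bin counts are normalized to proportions.
from collections import Counter

truebins = {'a': 'common', 'b': 'common', 'z': 'rare'}
counts = Counter({'a': 6, 'b': 2, 'q': 2})  # 'q' is unmapped -> 'never'
c = Counter()
for k, v in counts.items():
    c.update({truebins.get(k, 'never'): v})
t = sum(c.values())
print({k: round_sigfigs(v / t, 4) for k, v in c.items()})
# {'common': 0.8, 'never': 0.2}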