def calculate_metric(self):
    total_exact_match = 0
    pred_exact_match = 0
    num_samples = len(self.all_target_trees)
    for beam_pred, target in zip(self.all_beam_preds, self.all_target_trees):
        # Walk the beam in order: the first matching hypothesis counts toward
        # the top-k exact match, and toward the top-1 exact match only when it
        # is the best (index 0) hypothesis.
        for index, pred in enumerate(beam_pred):
            if self._compare_target_prediction_tokens(pred, target):
                total_exact_match += 1
                if index == 0:
                    pred_exact_match += 1
                break
    exact_match = round(safe_division(pred_exact_match, num_samples) * 100.0, 2)
    exact_match_top_k = round(
        safe_division(total_exact_match, num_samples) * 100.0, 2
    )
    # Beam width (k); guard the list that is actually indexed so an empty
    # prediction set cannot raise an IndexError.
    k = 0 if len(self.all_beam_preds) == 0 else len(self.all_beam_preds[0])
    length_metrics, length_reports = compute_length_metrics(
        self.all_target_lens, self.all_target_length_preds, self.select_length_beam
    )
    return MaskedSeq2SeqTopKMetrics(
        loss=self.calculate_loss(),
        exact_match=exact_match,
        f1=-1,
        bleu=-1,
        k=k,
        exact_match_top_k=exact_match_top_k,
        f1_top_k=-1,
        bleu_top_k=-1,
        length_metrics=length_metrics,
        length_reports=length_reports,
    )
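
# The helper below is a minimal, self-contained sketch of the top-k
# exact-match bookkeeping used in calculate_metric above. It assumes the
# comparison is plain token-sequence equality; the reporter's
# _compare_target_prediction_tokens may additionally strip special tokens,
# so this is an illustration, not the library implementation.
def _toy_topk_exact_match(all_beam_preds, all_targets):
    top1_matches, topk_matches = 0, 0
    for beam, target in zip(all_beam_preds, all_targets):
        for index, pred in enumerate(beam):
            if pred == target:
                topk_matches += 1
                if index == 0:
                    top1_matches += 1
                break
    return top1_matches, topk_matches


# Example: the target only appears at beam position 1, so it counts toward
# the top-k exact match but not the top-1 exact match.
assert _toy_topk_exact_match([[[1, 2], [3, 4]]], [[3, 4]]) == (0, 1)
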
def calculate_metric(self):
    total_exact_match = 0
    total_f1 = 0.0
    num_samples = len(self.all_targets)
    trg_vocab = self.tensorizers["trg_seq_tokens"].vocab
    bleu_scorer = bleu.Scorer(
        bleu.BleuConfig(
            pad=trg_vocab.get_pad_index(),
            eos=trg_vocab.get_eos_index(),
            unk=trg_vocab.get_unk_index(),
        )
    )
    for beam_preds, target in zip(self.all_preds, self.all_targets):
        # Only the top (first) hypothesis of each beam is scored.
        pred = beam_preds[0]
        if self._compare_target_prediction_tokens(pred, target):
            total_exact_match += 1
        total_f1 += compute_f1(pred, target)
        # BLEU metric calculation is always done with tensors on CPU, or the
        # type checks in fairseq/bleu.py:add() will fail.
        bleu_scorer.add(torch.IntTensor(target).cpu(), torch.IntTensor(pred).cpu())
    loss = self.calculate_loss()
    exact_match = round(safe_division(total_exact_match, num_samples) * 100.0, 2)
    f1 = round(safe_division(total_f1, num_samples) * 100.0, 2)
    bleu_score = round(0.0 if len(self.all_preds) == 0 else bleu_scorer.score(), 2)
    return Seq2SeqMetrics(loss, exact_match, f1, bleu_score)
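
# `safe_division` and `compute_f1` above come from the surrounding library.
# The stand-ins below only illustrate the behaviour the metric code assumes
# (0.0 on a zero denominator, bag-of-tokens F1 between prediction and target);
# they are hedged sketches, not the actual implementations.
from collections import Counter


def _toy_safe_division(numerator, denominator):
    return 0.0 if denominator == 0 else numerator / denominator


def _toy_compute_f1(pred_tokens, target_tokens):
    overlap = sum((Counter(pred_tokens) & Counter(target_tokens)).values())
    precision = _toy_safe_division(overlap, len(pred_tokens))
    recall = _toy_safe_division(overlap, len(target_tokens))
    return _toy_safe_division(2 * precision * recall, precision + recall)


# Example: 2 of 3 predicted tokens overlap a 2-token target, so
# precision = 2/3, recall = 1.0 and F1 = 0.8.
assert abs(_toy_compute_f1([5, 7, 9], [5, 7]) - 0.8) < 1e-6
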
def compute_topk_classification_metrics(
    predictions: Sequence[LabelTopKPrediction],
    label_names: Sequence[str],
    loss: float,
    log_per_label_metrics: bool = True,
) -> ClassificationMetrics:
    """
    A general function that computes classification metrics given a list of
    top-k label predictions.

    Args:
        predictions: Label predictions, including the confidence score for
            each label.
        label_names: Indexed label names.
        loss: Calculated loss.
        log_per_label_metrics: Report macro PRF1 values per label.

    Returns:
        ClassificationMetrics which contains various classification metrics.
    """
    num_correct = 0
    per_label_confusions = PerLabelConfusions()
    for _, topk, label in predictions:
        predicted_labels = list(map(label_names.__getitem__, topk))
        expected_label = label_names[label]
        if expected_label in predicted_labels:
            num_correct += 1
            per_label_confusions.update(expected_label, "TP", 1)
        else:
            # The gold label is missing from the top-k list: charge a false
            # negative to the gold label and a false positive to the top-1
            # prediction.
            per_label_confusions.update(expected_label, "FN", 1)
            per_label_confusions.update(predicted_labels[0], "FP", 1)
    macro_prf1_metrics = per_label_confusions.compute_metrics(
        log_per_label_metrics=log_per_label_metrics
    )
    return ClassificationMetrics(
        accuracy=safe_division(num_correct, len(predictions)),
        macro_prf1_metrics=macro_prf1_metrics,
        per_label_soft_scores={},
        mcc=None,
        roc_auc=None,
        loss=loss,
    )
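
# A standalone sketch of the top-k accuracy bookkeeping in
# compute_topk_classification_metrics, using plain (scores, topk, label)
# tuples in place of LabelTopKPrediction; the names and data here are
# illustrative assumptions, not the library's structures.
def _toy_topk_accuracy(predictions, label_names):
    num_correct = 0
    for _scores, topk, label in predictions:
        if label_names[label] in [label_names[i] for i in topk]:
            num_correct += 1
    return num_correct / len(predictions) if predictions else 0.0


# Example: the gold label ("neg", index 0) appears in the top-2 list [1, 0],
# so the single example is counted as correct and accuracy is 1.0.
assert _toy_topk_accuracy([([0.4, 0.6], [1, 0], 0)], ["neg", "pos"]) == 1.0
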
def calculate_metric(self):
    num_correct = 0
    total_count = len(self.all_targets)
    trg_vocab = self.tensorizers["trg_seq_tokens"].vocab
    bleu_scorer = bleu.Scorer(
        trg_vocab.get_pad_index(),
        trg_vocab.get_eos_index(),
        trg_vocab.get_unk_index(),
    )
    for beam_pred, target in zip(self.all_preds, self.all_targets):
        pred = beam_pred[0]
        if self._compare_target_prediction_tokens(pred, target):
            num_correct += 1
        # BLEU metric calculation is always done with tensors on CPU, or the
        # type checks in fairseq/bleu.py:add() will fail.
        bleu_scorer.add(torch.IntTensor(target).cpu(), torch.IntTensor(pred).cpu())
    bleu_score = 0.0 if len(self.all_preds) == 0 else bleu_scorer.score()
    accuracy = safe_division(num_correct, total_count)
    cross_entropy_loss = self.calculate_loss()
    return Seq2SeqMetrics(accuracy, cross_entropy_loss, bleu_score)
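
# The explicit .cpu() calls above matter because fairseq's BLEU scorer
# type-checks the tensors passed to add(). A small, self-contained
# illustration of the conversion the reporters rely on (the token ids are
# made up for the example):
import torch

target_ids = [4, 9, 11, 2]  # hypothetical token ids
pred_ids = [4, 9, 12, 2]

target_tensor = torch.IntTensor(target_ids).cpu()
pred_tensor = torch.IntTensor(pred_ids).cpu()
assert target_tensor.dtype == torch.int32 and not target_tensor.is_cuda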