    def test_fbeta_multilabel_with_weighted_average(self, device: str):
        self.predictions = self.predictions.to(device)
        self.targets = self.targets.to(device)

        labels = [0, 1]
        fbeta = FBetaMultiLabelMeasure(average="weighted", labels=labels)
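        # Weighted averaging weights each per-label score by its support; the sklearn
        # call below with average="weighted" serves as the reference.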
        fbeta(self.predictions, self.targets)
        metric = fbeta.get_metric()
        precisions = metric["precision"]
        recalls = metric["recall"]
        fscores = metric["fscore"]

        weighted_precision, weighted_recall, weighted_fscore, _ = precision_recall_fscore_support(
            self.targets.cpu().numpy(),
            torch.where(
                self.predictions >= fbeta._threshold,
                torch.ones_like(self.predictions),
                torch.zeros_like(self.predictions),
            )
            .cpu()
            .numpy(),
            labels=labels,
            average="weighted",
        )

        # check value
        assert_allclose(precisions, weighted_precision)
        assert_allclose(recalls, weighted_recall)
        assert_allclose(fscores, weighted_fscore)

    def test_fbeta_multilabel_with_micro_average(self, device: str):
        self.predictions = self.predictions.to(device)
        self.targets = self.targets.to(device)

        labels = [1, 3]
        fbeta = FBetaMultiLabelMeasure(average="micro", labels=labels)
        fbeta(self.predictions, self.targets)
        metric = fbeta.get_metric()
        precisions = metric["precision"]
        recalls = metric["recall"]
        fscores = metric["fscore"]

        # We keep the expected values on the CPU because FBetaMeasure returns them on the CPU.
        true_positives = torch.tensor([3, 4], dtype=torch.float32)
        false_positives = torch.tensor([0, 0], dtype=torch.float32)
        false_negatives = torch.tensor([2, 0], dtype=torch.float32)
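        # Micro averaging pools counts across the selected labels; using means instead
        # of sums leaves the ratios unchanged because the shared factor cancels.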
        mean_true_positive = true_positives.mean()
        mean_false_positive = false_positives.mean()
        mean_false_negative = false_negatives.mean()

        micro_precision = mean_true_positive / (mean_true_positive + mean_false_positive)
        micro_recall = mean_true_positive / (mean_true_positive + mean_false_negative)
        micro_fscore = (2 * micro_precision * micro_recall) / (micro_precision + micro_recall)
        # check value
        assert_allclose(precisions, micro_precision)
        assert_allclose(recalls, micro_recall)
        assert_allclose(fscores, micro_fscore)

    def test_fbeta_multilabel_with_mask(self, device: str):
        self.predictions = self.predictions.to(device)
        self.targets = self.targets.to(device)

        mask = torch.tensor([True, True, True, True, True, False], device=device).unsqueeze(-1)
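        # The final instance is masked out, so it contributes nothing to the counts below.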

        fbeta = FBetaMultiLabelMeasure()
        fbeta(self.predictions, self.targets, mask)
        metric = fbeta.get_metric()
        precisions = metric["precision"]
        recalls = metric["recall"]
        fscores = metric["fscore"]

        assert_allclose(fbeta._pred_sum.tolist(), [3, 3, 3, 4, 1])
        assert_allclose(fbeta._true_sum.tolist(), [4, 5, 2, 4, 0])
        assert_allclose(fbeta._true_positive_sum.tolist(), [3, 3, 2, 4, 0])
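        # Per-label precision is TP / pred_sum and recall is TP / true_sum, using the
        # masked counts above.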

        desired_precisions = [3 / 3, 3 / 3, 2 / 3, 4 / 4, 0 / 1]
        desired_recalls = [3 / 4, 3 / 5, 2 / 2, 4 / 4, 0.00]
        desired_fscores = [
            (2 * p * r) / (p + r) if p + r != 0.0 else 0.0
            for p, r in zip(desired_precisions, desired_recalls)
        ]
        assert_allclose(precisions, desired_precisions)
        assert_allclose(recalls, desired_recalls)
        assert_allclose(fscores, desired_fscores)

    def test_fbeta_multilabel_handles_batch_size_of_one(self, device: str):
        predictions = torch.tensor([[0.2862, 0.5479, 0.1627, 0.2033]], device=device)
        targets = torch.tensor([[0, 1, 0, 0]], device=device)
        mask = torch.tensor([[True]], device=device)
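        # Only label 1 is both predicted and gold, so it is the only label with nonzero scores.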

        fbeta = FBetaMultiLabelMeasure()
        fbeta(predictions, targets, mask)
        metric = fbeta.get_metric()
        precisions = metric["precision"]
        recalls = metric["recall"]

        assert_allclose(precisions, [0.0, 1.0, 0.0, 0.0])
        assert_allclose(recalls, [0.0, 1.0, 0.0, 0.0])

    def test_fbeta_multilabel_handles_no_prediction_true_all_class(self, device: str):
        predictions = torch.tensor([[0.65, 0.35], [0.0, 0.0]], device=device)
        # preds = [0, NA]: the first instance predicts only label 0, the second predicts nothing.
        targets = torch.tensor([[0, 1], [0, 1]], device=device)
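        # Label 0 is never gold and label 1 is never predicted, so every score is zero.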

        fbeta = FBetaMultiLabelMeasure()
        fbeta(predictions, targets)
        metric = fbeta.get_metric()
        precisions = metric["precision"]
        recalls = metric["recall"]
        fscores = metric["fscore"]

        assert_allclose(precisions, [0.0, 0.0])
        assert_allclose(recalls, [0.0, 0.0])
        assert_allclose(fscores, [0.0, 0.0])

    def test_multiple_distributed_runs(self):
        predictions = [
            torch.tensor(
                [
                    [0.55, 0.25, 0.10, 0.10, 0.20],
                    [0.10, 0.60, 0.10, 0.95, 0.00],
                    [0.90, 0.80, 0.75, 0.80, 0.00],
                ]
            ),
            torch.tensor(
                [
                    [0.49, 0.50, 0.95, 0.55, 0.00],
                    [0.60, 0.49, 0.60, 0.65, 0.85],
                    [0.85, 0.40, 0.10, 0.20, 0.00],
                ]
            ),
        ]
        targets = [
            torch.tensor([[1, 1, 0, 0, 0], [0, 1, 0, 1, 0], [1, 1, 0, 1, 0]]),
            torch.tensor([[1, 1, 1, 1, 0], [1, 1, 1, 1, 0], [0, 0, 0, 0, 0]]),
        ]
        metric_kwargs = {"predictions": predictions, "gold_labels": targets}
        desired_metrics = {
            "precision": self.desired_precisions,
            "recall": self.desired_recalls,
            "fscore": self.desired_fscores,
        }
        run_distributed_test(
            [-1, -1],
            multiple_runs,
            FBetaMultiLabelMeasure(),
            metric_kwargs,
            desired_metrics,
            exact=False,
        )

    def test_fbeta_multilabel_with_explicit_labels(self, device: str):
        self.predictions = self.predictions.to(device)
        self.targets = self.targets.to(device)

        # same predictions but with an explicit label ordering
        fbeta = FBetaMultiLabelMeasure(labels=[4, 3, 2, 1, 0])
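        # With the labels listed in reverse order, the per-label results should come back reversed too.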
        fbeta(self.predictions, self.targets)
        metric = fbeta.get_metric()
        precisions = metric["precision"]
        recalls = metric["recall"]
        fscores = metric["fscore"]

        desired_precisions = self.desired_precisions[::-1]
        desired_recalls = self.desired_recalls[::-1]
        desired_fscores = self.desired_fscores[::-1]
        # check value
        assert_allclose(precisions, desired_precisions)
        assert_allclose(recalls, desired_recalls)
        assert_allclose(fscores, desired_fscores)

    def test_fbeta_multilabel_metric(self, device: str):
        self.predictions = self.predictions.to(device)
        self.targets = self.targets.to(device)

        fbeta = FBetaMultiLabelMeasure()
        fbeta(self.predictions, self.targets)
        metric = fbeta.get_metric()
        precisions = metric["precision"]
        recalls = metric["recall"]
        fscores = metric["fscore"]

        # check value
        assert_allclose(precisions, self.desired_precisions)
        assert_allclose(recalls, self.desired_recalls)
        assert_allclose(fscores, self.desired_fscores)

        # check type
        assert isinstance(precisions, List)
        assert isinstance(recalls, List)
        assert isinstance(fscores, List)

    def test_fbeta_multilabel_with_macro_average(self, device: str):
        self.predictions = self.predictions.to(device)
        self.targets = self.targets.to(device)

        labels = [0, 1]
        fbeta = FBetaMultiLabelMeasure(average="macro", labels=labels)
        fbeta(self.predictions, self.targets)
        metric = fbeta.get_metric()
        precisions = metric["precision"]
        recalls = metric["recall"]
        fscores = metric["fscore"]

        # We keep the expected values on the CPU because FBetaMeasure returns them on the CPU.
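        # Macro averaging is the unweighted mean of the per-label scores over the selected labels.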
        macro_precision = torch.tensor(self.desired_precisions)[labels].mean()
        macro_recall = torch.tensor(self.desired_recalls)[labels].mean()
        macro_fscore = torch.tensor(self.desired_fscores)[labels].mean()

        # check value
        assert_allclose(precisions, macro_precision)
        assert_allclose(recalls, macro_recall)
        assert_allclose(fscores, macro_fscore)

    def test_fbeta_multilabel_state(self, device: str):
        self.predictions = self.predictions.to(device)
        self.targets = self.targets.to(device)

        fbeta = FBetaMultiLabelMeasure()
        fbeta(self.predictions, self.targets)

        # check state
        assert_allclose(fbeta._pred_sum.tolist(), self.pred_sum)
        assert_allclose(fbeta._true_sum.tolist(), self.true_sum)
        assert_allclose(fbeta._true_positive_sum.tolist(), self.true_positive_sum)
        assert_allclose(fbeta._true_negative_sum.tolist(), self.true_negative_sum)
        assert_allclose(fbeta._total_sum.tolist(), self.total_sum)

    def test_runtime_errors(self, device: str):
        fbeta = FBetaMultiLabelMeasure()
        # Metric was never called.
        pytest.raises(RuntimeError, fbeta.get_metric)


def multiple_runs(
    global_rank: int,
    world_size: int,
    gpu_id: Union[int, torch.device],
    metric: FBetaMultiLabelMeasure,
    metric_kwargs: Dict[str, List[Any]],
    desired_values: Dict[str, Any],
    exact: Union[bool, Tuple[float, float]] = True,
):
    kwargs = {}
    # Use the arguments meant for the process with rank `global_rank`.
    for argname in metric_kwargs:
        kwargs[argname] = metric_kwargs[argname][global_rank]

    for _ in range(200):
        metric(**kwargs)

    metric_values = metric.get_metric()

    for key in desired_values:
        assert_allclose(desired_values[key], metric_values[key])