コード例 #1
0
 def test_set_range(self):
     """Check that ``set_range`` updates the bounds of the ``f1`` column."""
     report = Report(
         self.data,
         self.cols,
         model_name=self.model_name,
         dataset_name=self.dataset_name,
     )
     report.set_range("f1", 0.1, 0.3)

     f1_columns = [col for col in report.columns if col.title == "f1"]
     # Guard against a vacuous pass: without this check, a report with no
     # column titled "f1" would skip every assertion below.
     self.assertTrue(f1_columns, 'no column titled "f1" found in report')
     for col in f1_columns:
         self.assertEqual((col.min_val, col.max_val), (0.1, 0.3))
コード例 #2
0
 def test_set_class_codes(self):
     """Check that ``set_class_codes`` reaches every class-distribution column."""
     report = Report(
         self.data,
         self.cols,
         model_name=self.model_name,
         dataset_name=self.dataset_name,
     )
     custom_class_codes = ["A", "B", "C"]
     report.set_class_codes(custom_class_codes)

     dist_columns = [
         col for col in report.columns
         if isinstance(col, ClassDistributionColumn)
     ]
     # Guard against a vacuous pass: without this check, a report with no
     # ClassDistributionColumn would skip every assertion below.
     self.assertTrue(dist_columns,
                     "no ClassDistributionColumn found in report")
     for col in dist_columns:
         self.assertEqual(col.class_codes, custom_class_codes)
コード例 #3
0
    def test_filter(self):
        """Check ``Report.filter`` by category and by slice against expected frames."""

        def fresh_report():
            # Every scenario starts from a newly constructed report.
            return Report(
                self.data,
                self.cols,
                model_name=self.model_name,
                dataset_name=self.dataset_name,
            )

        # Scenario 1: restrict to a single category.
        by_category = fresh_report()
        by_category.filter(categories=["Cat B"])
        category_result = by_category.data
        category_rows = [
            ["Cat B", "Slice B", 0.4, 20, [0.5, 0.4, 0.1], 812],
            ["Cat B", "Slice D", 0.5, 25, [0.3, 0.2, 0.5], 13312],
        ]
        self.assertTrue(category_result.equals(pd.DataFrame(category_rows)))

        # Scenario 2: restrict to two slice names.
        by_slice = fresh_report()
        by_slice.filter(slices=["Slice A", "Slice C"])
        slice_result = by_slice.data
        slice_rows = [
            ["Cat A", "Slice C", 0.1, 5, [0.1, 0.2, 0.7], 300],
            ["Cat C", "Slice A", 0.2, 10, [0.4, 0.2, 0.4], 3],
            ["Cat A", "Slice A", 0.3, 15, [0.1, 0, 0.9], 5000],
        ]
        self.assertTrue(slice_result.equals(pd.DataFrame(slice_rows)))
コード例 #4
0
 def test_rename(self):
     """Check ``Report.rename`` with both a category map and a slice map."""
     report = Report(
         self.data,
         self.cols,
         model_name=self.model_name,
         dataset_name=self.dataset_name,
     )
     report.rename(
         category_map={"Cat C": "Cat D"},
         slice_map={"Slice A": "Slice D"},
     )
     renamed = report.data

     # Expected frame after "Cat C" -> "Cat D" and "Slice A" -> "Slice D".
     expected_rows = [
         ["Cat A", "Slice C", 0.1, 5, [0.1, 0.2, 0.7], 300],
         ["Cat D", "Slice D", 0.2, 10, [0.4, 0.2, 0.4], 3],
         ["Cat A", "Slice D", 0.3, 15, [0.1, 0, 0.9], 5000],
         ["Cat B", "Slice B", 0.4, 20, [0.5, 0.4, 0.1], 812],
         ["Cat B", "Slice D", 0.5, 25, [0.3, 0.2, 0.5], 13312],
     ]
     self.assertTrue(renamed.equals(pd.DataFrame(expected_rows)))
コード例 #5
0
    def test_init(self):
        """Check that ``Report`` keeps its data and accepts config overrides."""
        naming = dict(
            model_name=self.model_name,
            dataset_name=self.dataset_name,
        )

        # Plain construction: the report's data equals the fixture data.
        plain = Report(self.data, self.cols, **naming)
        self.assertTrue(self.data.equals(plain.data))

        # A config keyword passed to the constructor shows up in `config`.
        palette = ["#000000"]
        themed = Report(self.data, self.cols, **naming, color_scheme=palette)
        self.assertEqual(palette, themed.config["color_scheme"])
コード例 #6
0
 def test_display_2(self):
     """Build a three-row report and call ``figure().show()`` (no assertions)."""
     # The three rows differ only in the slice name.
     rows = [
         [
             "Eval",
             slice_name,
             0.8799999952316284,
             0.876409113407135,
             [0.368, 0.304, 0.328],
             [0.344, 0.288, 0.368],
             125,
         ]
         for slice_name in ("snli1", "snli2", "snli3")
     ]
     frame = pd.DataFrame(rows)

     score_kwargs = dict(min_val=0, max_val=1, is_0_to_1=True)
     class_codes = ["e", "n", "c"]
     columns = [
         ScoreColumn("F1", **score_kwargs),
         ScoreColumn("Accuracy", **score_kwargs),
         ClassDistributionColumn("Class Dist", list(class_codes)),
         ClassDistributionColumn("Pred Dist", list(class_codes)),
         NumericColumn("Size"),
     ]

     figure = Report(frame, columns).figure()
     figure.show()
コード例 #7
0
    def test_figure(self):
        """Check that ``figure`` raises on the unsorted fixture and works after ``sort``."""
        report = Report(
            self.data,
            self.cols,
            model_name=self.model_name,
            dataset_name=self.dataset_name,
        )

        # The fixture data is unsorted, which figure() is expected to reject.
        with self.assertRaises(ValueError):
            report.figure()

        # After sorting, the same call must no longer raise ValueError.
        report.sort()
        try:
            report.figure()
        except ValueError:
            self.fail("report.figure() raised ValueError unexpectedly!")
コード例 #8
0
    def create_report(
        self,
        model: Union[Model, str],
        metric_ids: List[str] = None,
    ) -> Report:
        """Generate a report from cached metrics for a model.

        Args:
            model: Model or model id. Metrics must have already been computed
                for this model.
            metric_ids (optional): list of metric ids to include, in the
                desired column order. If None, the metric ids are taken from
                the first slice: sorted, with "class_dist" and "pred_dist"
                (when present) placed last.

        Returns:
            A Report with one row per slice (category, slice name, one value
            per metric id, slice size), after ``Report.sort`` has been called
            with a fixed category order.

        Raises:
            ValueError: if there are no slices, if no metrics are cached for
                ``model``, or if a slice has no cached metrics for ``model``.
        """

        if len(self.slices) == 0:
            raise ValueError("Cannot create report for empty testbench")

        if isinstance(model, Model):
            model = model.identifier
        if model not in self.metrics:
            raise ValueError(
                f"Metrics for model {model} have not been computed yet."
                f" You must first execute one of "
                "the following methods for this model: 'evaluate', "
                "'add_predictions', 'add_metrics'")

        # TODO(Jesse): Need a category for test set

        model_metrics = self.metrics[model]

        # TODO(Jesse): where to put this? Should only need to be called once
        self._human_readable_identifiers()

        # Metric ids that are rendered as class distributions, not scores.
        dist_metric_ids = ("class_dist", "pred_dist")

        if metric_ids is None:
            sample_slice = list(self.slices)[0].identifier
            available_ids = list(model_metrics[sample_slice].keys())
            # Score metrics first (sorted), distribution metrics last.
            metric_ids = sorted(metric_id for metric_id in available_ids
                                if metric_id not in dist_metric_ids)
            metric_ids.extend(metric_id for metric_id in dist_metric_ids
                              if metric_id in available_ids)

        # Class codes are the upper-cased first letter of each class name.
        # They do not depend on the metric, so compute them once, and only
        # when a distribution column is actually requested.
        class_cds = None
        if self.task is not None and any(metric_id in dist_metric_ids
                                         for metric_id in metric_ids):
            output_schema = self.task.output_schema
            class_names = output_schema.features[list(
                output_schema.columns)[0]].names
            class_cds = [name[0].upper() for name in class_names]

        # Populate columns
        columns = []
        for metric_id in metric_ids:
            if metric_id in dist_metric_ids:
                # Give each column its own list so columns never share state.
                codes = None if class_cds is None else list(class_cds)
                columns.append(ClassDistributionColumn(metric_id, codes))
            else:
                columns.append(
                    ScoreColumn(metric_id,
                                min_val=0,
                                max_val=1,
                                is_0_to_1=True))
        columns.append(NumericColumn("Size"))

        category_names = {
            GENERIC: "DataPanel",
            SUBPOPULATION: "SubPop",
            ATTACK: "Attack",
            AUGMENTATION: "Augment",
            CURATION: "Eval",
        }

        # Populate data
        data = []
        for sl in self.slices:
            if sl.identifier not in model_metrics:
                # The trailing space keeps the two message parts from
                # running together.
                raise ValueError(
                    f"Metrics for model {model} and slice {sl.identifier} "
                    f"have not yet been computed.")
            slice_metrics = model_metrics[sl.identifier]
            slice_category = category_names.get(sl.category,
                                                sl.category.capitalize())
            row = [slice_category, self.ident_mapping[sl.identifier]]
            row.extend(slice_metrics[metric_id] for metric_id in metric_ids)
            row.append(len(sl))
            data.append(row)

        # TODO(karan): generalize aggregation
        # slice_metrics = tz.merge_with(np.mean, slice_metrics)
        # Task-dependent model predictions
        # TODO(karan): e.g. average class distribution predicted, figure out how to
        #  put this in
        # Task-dependent sl information
        # TODO(karan): e.g. class distribution

        df = pd.DataFrame(data)

        report = Report(data=df,
                        columns=columns,
                        model_name=model,
                        dataset_name=self.dataset_id)
        category_order = {
            cat: i
            for i, cat in enumerate(
                [SUBPOPULATION, AUGMENTATION, CURATION, ATTACK, GENERIC])
        }
        report.sort(category_order=category_order)
        return report
コード例 #9
0
    def create_report(
        self,
        models: List[str] = None,
        aggregator_columns: Dict[str, ReportColumn] = None,
    ) -> Report:
        """Generate a report for models in the bench.

        Args:
            models (List[str]): names of one or more models that are in the devbench.
                If None, all models that have metrics are used.
            aggregator_columns (Dict[str, (ReportColumn, dict)]):
                dict mapping aggregator names to a tuple.

                The first entry of the tuple is the ReportColumn that should be
                used for visualization. The second entry is a dict of kwargs that
                will be passed to the ReportColumn using
                `ReportColumn.__init__(..., **kwargs)`.

                For instance,
                >>> devbench.create_report(
                ...     models=['BERT'],
                ...     aggregator_columns={
                ...         'accuracy': (ScoreColumn, {'min_val': 0.3})
                ...     }
                ... )

                By default, aggregators will be displayed as a ScoreColumn
                with `min_val=0`, `max_val=1` and `is_0_to_1=True`.

        Returns:
            a Report, summarizing the performance of the models.

        Raises:
            ValueError: if the bench has no slices.
            AssertionError: if a requested model has no entry in the metrics.
        """

        if len(self.slices) == 0:
            raise ValueError("No slices found in Bench. Cannot create report.")

        if models is not None:
            for model in models:
                assert model in self.metrics, f"Model {model} not found."
        else:
            # Use all the models that are available
            models = list(self.metrics.keys())

        # Set identifiers to be human readable
        self._human_readable_identifiers()

        # Get the list of aggregators that are shared by `models`
        shared_aggregators = list(self._shared_aggregators(models))

        # Populate columns: one per (model, aggregator) pair, then the size.
        columns = []
        for model in models:
            for aggregator in shared_aggregators:
                if aggregator_columns and aggregator in aggregator_columns:
                    column_type, column_kwargs = aggregator_columns[aggregator]
                else:
                    column_type = ScoreColumn
                    column_kwargs = dict(min_val=0, max_val=1, is_0_to_1=True)
                columns.append(column_type(f"{model}-{aggregator}", **column_kwargs))
        columns.append(NumericColumn("Size"))

        category_names = {
            GENERIC: "Slice",
            SUBPOPULATION: "SubPop",
            ATTACK: "Attack",
            AUGMENTATION: "Augment",
            CURATION: "Eval",
        }

        # Populate data
        data = []
        for sl in self.slices:
            slice_name = self.ident_mapping[sl.identifier]
            slice_size = len(sl)
            slice_category = category_names.get(sl.category, sl.category.capitalize())

            row = [slice_category, slice_name]

            for model in models:
                model_metrics = self.metrics[model]
                if sl.identifier not in model_metrics:
                    # Pad with placeholders so the row stays aligned with
                    # `columns`. Skipping the cells would shift every later
                    # value, including the size, into the wrong column.
                    row.extend([None] * len(shared_aggregators))
                    continue
                slice_metrics = model_metrics[sl.identifier]
                row.extend(slice_metrics[agg] for agg in shared_aggregators)

            row.append(slice_size)
            data.append(row)

        df = pd.DataFrame(data)

        report = Report(
            data=df,
            columns=columns,
        )
        category_order = {
            cat: i
            for i, cat in enumerate(
                [SUBPOPULATION, AUGMENTATION, CURATION, ATTACK, GENERIC]
            )
        }
        report.sort(category_order=category_order)
        return report
コード例 #10
0
    def test_sort(self):
        """Check ``Report.sort`` with default, category, slice and combined orderings."""
        # Cell values of each fixture row, keyed by (category, slice).
        cells = {
            ("Cat A", "Slice A"): [0.3, 15, [0.1, 0, 0.9], 5000],
            ("Cat A", "Slice C"): [0.1, 5, [0.1, 0.2, 0.7], 300],
            ("Cat B", "Slice B"): [0.4, 20, [0.5, 0.4, 0.1], 812],
            ("Cat B", "Slice D"): [0.5, 25, [0.3, 0.2, 0.5], 13312],
            ("Cat C", "Slice A"): [0.2, 10, [0.4, 0.2, 0.4], 3],
        }

        def expected_frame(row_order):
            # Build the expected frame with rows in the given key order.
            return pd.DataFrame([
                [category, slice_name, *cells[(category, slice_name)]]
                for category, slice_name in row_order
            ])

        category_order = {
            "Cat B": 0,
            "Cat C": 2,
            "Cat A": 1,
        }
        slice_order = {
            "Slice D": 0,
            "Slice C": 1,
            "Slice B": 2,
            "Slice A": 3
        }

        # (keyword arguments for sort, expected row order), in scenario order.
        scenarios = [
            # Sort alphabetically
            (
                {},
                [
                    ("Cat A", "Slice A"),
                    ("Cat A", "Slice C"),
                    ("Cat B", "Slice B"),
                    ("Cat B", "Slice D"),
                    ("Cat C", "Slice A"),
                ],
            ),
            # Sort by specified category order
            (
                {"category_order": dict(category_order)},
                [
                    ("Cat B", "Slice B"),
                    ("Cat B", "Slice D"),
                    ("Cat A", "Slice A"),
                    ("Cat A", "Slice C"),
                    ("Cat C", "Slice A"),
                ],
            ),
            # Sort by specified slice order
            (
                {"slice_order": dict(slice_order)},
                [
                    ("Cat A", "Slice C"),
                    ("Cat A", "Slice A"),
                    ("Cat B", "Slice D"),
                    ("Cat B", "Slice B"),
                    ("Cat C", "Slice A"),
                ],
            ),
            # Sort by specified category order and slice order
            (
                {
                    "category_order": dict(category_order),
                    "slice_order": dict(slice_order),
                },
                [
                    ("Cat B", "Slice D"),
                    ("Cat B", "Slice B"),
                    ("Cat A", "Slice C"),
                    ("Cat A", "Slice A"),
                    ("Cat C", "Slice A"),
                ],
            ),
        ]

        for sort_kwargs, row_order in scenarios:
            # Each scenario sorts a newly constructed report.
            report = Report(
                self.data,
                self.cols,
                model_name=self.model_name,
                dataset_name=self.dataset_name,
            )
            report.sort(**sort_kwargs)
            actual = report.data
            self.assertTrue(actual.equals(expected_frame(row_order)))
コード例 #11
0
    def test_display(self):
        """Call ``figure().show()`` twice on one report (no assertions)."""
        report = Report(
            self.data,
            self.cols,
            model_name=self.model_name,
            dataset_name=self.dataset_name,
        )

        # First figure: default sort only.
        report.sort()
        report.figure().show()

        # Second figure: custom category order, renamed labels, a slice
        # filter, a custom "f1" range and a config override, with the title on.
        custom_category_order = {"Cat C": 1, "Cat A": 2, "Cat B": 3}
        report.sort(category_order=custom_category_order)
        report.rename(slice_map={"Slice A": "A"}, category_map={"Cat B": "B"})
        kept_slices = ["A", "Slice B", "Slice C"]
        report.filter(slices=kept_slices)
        report.set_range("f1", 0.05, 0.45)
        report.update_config(font_size_heading=16)
        report.figure(show_title=True).show()