Esempio n. 1
0
 def setUp(self):
     """Create the fixture shared by the tests in this case.

     The fixture is a ``BinaryClassifierData`` built from nine pairs of a
     float and a 0/1 flag (presumably predicted score and expected class;
     confirm against ``BinaryClassifierData``).
     """
     samples = [
         (0.1, 0),
         (0.2, 0),
         (0.3, 0),
         (0.4, 1),
         (0.5, 0),
         (0.6, 1),
         (0.7, 1),
         (0.8, 1),
         (0.9, 1),
     ]
     self.data = BinaryClassifierData(samples)
Esempio n. 2
0
 def data(self, data):
     """Store the data points from which the curve is generated.

     A value that is already a ``BinaryClassifierData`` instance is kept
     as is; anything else is passed to the ``BinaryClassifierData``
     constructor first. ``_calculate_points()`` is called afterwards.
     """
     already_wrapped = isinstance(data, BinaryClassifierData)
     self._data = data if already_wrapped else BinaryClassifierData(data)
     self._calculate_points()
Esempio n. 3
0
    def run_tests(self):
        """Runs pairwise significance tests on the datasets found in
        ``self.data``.

        ``self.data`` must be a dict whose ``__class__`` key maps to the
        expected classes; every other key maps to the observed values of one
        dataset. Each unordered pair of datasets is compared once with a
        ``PairedPermutationTest`` built for ``self.curve_class``, and one
        line per pair is logged with the difference, the p-value and a
        significance marker (``***`` for p < 0.01, ``**`` for p < 0.05,
        ``*`` for p < 0.1, nothing otherwise).

        ``self.data`` itself is left untouched.
        """
        data = self.data
        expected = data["__class__"]

        keys = sorted(data.keys())
        keys.remove("__class__")

        # Keep the wrapped datasets in a local dict instead of overwriting
        # the entries of self.data: other consumers of self.data (and a
        # second call of this method) zip the raw values with the expected
        # classes again and must not find already-wrapped objects there.
        datasets = {}
        for key in keys:
            self.log.info("Preparing dataset for %s..." % key)
            datasets[key] = BinaryClassifierData(zip(data[key], expected),
                                                 title=key)

        self.log.info("Running significance tests...")
        significance_test = PairedPermutationTest(self.curve_class)
        # keys is sorted and free of duplicates, so combinations() yields
        # exactly the pairs with key1 < key2, in the same order as filtering
        # the full Cartesian product would.
        for key1, key2 in itertools.combinations(keys, 2):
            diff, p_value = significance_test.test(datasets[key1],
                                                   datasets[key2])
            if p_value < 0.01:
                stars = "***"
            elif p_value < 0.05:
                stars = "**"
            elif p_value < 0.1:
                stars = "*"
            else:
                stars = ""
            self.log.info("%3s   d=%8.3g   p=%8.3g   %s vs %s" %
                          (stars, diff, p_value, key1, key2))
Esempio n. 4
0
class BinaryClassifierDataTest(unittest.TestCase):
    """Tests for the confusion matrix methods of ``BinaryClassifierData``."""

    def setUp(self):
        """Create the nine-sample dataset used by every test."""
        samples = [
            (0.1, 0),
            (0.2, 0),
            (0.3, 0),
            (0.4, 1),
            (0.5, 0),
            (0.6, 1),
            (0.7, 1),
            (0.8, 1),
            (0.9, 1),
        ]
        self.data = BinaryClassifierData(samples)

    def _check_iterated_matrices(self, summary, *iter_args):
        """Compare ``iter_confusion_matrices(*iter_args)`` with `summary`.

        `summary` is an indented multi-line string holding one
        ``tp=.., fp=.., fn=.., tn=..`` line per expected matrix. Every
        yielded matrix must have the matching ``repr`` and must be equal to
        the matrix returned by ``get_confusion_matrix`` for the threshold it
        was yielded with.
        """
        wanted_reprs = []
        for line in dedent(summary).split("\n"):
            wanted_reprs.append("BinaryConfusionMatrix(%s)" % line)

        produced = self.data.iter_confusion_matrices(*iter_args)
        # izip_longest pads the shorter side with None, so a length mismatch
        # between the produced and the expected sequence makes the test fail.
        for (threshold, matrix), wanted in izip_longest(produced, wanted_reprs):
            self.assertEqual(repr(matrix), wanted)
            self.assertEqual(matrix, self.data.get_confusion_matrix(threshold))

    def test_get_confusion_matrix(self):
        """Check the confusion matrix obtained at a few fixed thresholds."""
        cases = [
            (0.2, "BinaryConfusionMatrix(tp=5, fp=3, fn=0, tn=1)"),
            (0.5, "BinaryConfusionMatrix(tp=4, fp=1, fn=1, tn=3)"),
            (0.75, "BinaryConfusionMatrix(tp=2, fp=0, fn=3, tn=4)"),
            (1.0, "BinaryConfusionMatrix(tp=0, fp=0, fn=5, tn=4)"),
        ]
        for threshold, wanted in cases:
            mat = self.data.get_confusion_matrix(threshold)
            self.assertEqual(repr(mat), wanted)

    def test_iter_confusion_matrices(self):
        """Check the matrices yielded without arguments and with argument 4."""
        without_args = """\
        tp=5, fp=4, fn=0, tn=0
        tp=5, fp=3, fn=0, tn=1
        tp=5, fp=2, fn=0, tn=2
        tp=5, fp=1, fn=0, tn=3
        tp=4, fp=1, fn=1, tn=3
        tp=4, fp=0, fn=1, tn=4
        tp=3, fp=0, fn=2, tn=4
        tp=2, fp=0, fn=3, tn=4
        tp=1, fp=0, fn=4, tn=4
        tp=0, fp=0, fn=5, tn=4"""
        self._check_iterated_matrices(without_args)

        with_four = """\
        tp=5, fp=4, fn=0, tn=0
        tp=5, fp=2, fn=0, tn=2
        tp=4, fp=1, fn=1, tn=3
        tp=2, fp=0, fn=3, tn=4
        tp=0, fp=0, fn=5, tn=4"""
        self._check_iterated_matrices(with_four, 4)
Esempio n. 5
0
    def print_scores_for_curve(self, curve_class):
        """Prints the AUC of a `curve_class` curve for every dataset in
        `self.data`.

        `curve_class` is a subclass of `BinaryClassifierPerformanceCurve`.
        `self.data` must be a dict of lists whose ``__class__`` key maps to
        the expected classes of the elements; every other key names one
        dataset of observed values. Datasets are processed in sorted key
        order and one line is printed per dataset, followed by an empty line.
        """
        datasets = self.data
        expected = datasets["__class__"]

        # Every key except the special "__class__" entry, in sorted order
        names = [name for name in sorted(datasets.keys())
                 if name != "__class__"]

        print("Calculating AUCs for %s..." % curve_class.get_friendly_name())
        for name in names:
            pairs = zip(datasets[name], expected)
            curve = curve_class(BinaryClassifierData(pairs, title=name))
            print("  AUC[%s] = %.4f" % (name, curve.auc()))
        print("")
Esempio n. 6
0
    def get_figure_for_curves(self, curve_class):
        """Draws one `curve_class` curve per dataset in `self.data` on a
        shared set of axes.

        `curve_class` is a subclass of `BinaryClassifierPerformanceCurve`.
        `self.data` must be a dict of lists whose ``__class__`` key maps to
        the expected classes of the elements; every other key names one
        dataset of observed values.

        The following attributes of `self.options` are consulted:
        ``resampling``, ``show_auc``, ``dpi``, ``size`` and ``log_scale``.

        Returns the figure obtained from the first curve (an instance of
        `matplotlib.figure.Figure`), or ``None`` when there is no dataset
        to plot."""
        dataset_map = self.data
        expected = dataset_map["__class__"]

        names = [name for name in sorted(dataset_map.keys())
                 if name != "__class__"]

        # Line styles are reused cyclically if there are more datasets
        styles = [
            "r-", "b-", "g-", "c-", "m-", "y-", "k-",
            "r--", "b--", "g--", "c--", "m--", "y--", "k--",
        ]

        fig = None
        axes = None
        handles = []
        legend_labels = []
        auc_values = []

        # Plot the curves
        for name, style in izip(names, cycle(styles)):
            self.log.info("Calculating %s for %s..." %
                          (curve_class.get_friendly_name(), name))

            pairs = zip(dataset_map[name], expected)
            curve = curve_class(BinaryClassifierData(pairs, title=name))

            if self.options.resampling:
                curve.resample(step / 2000. for step in xrange(2001))

            if self.options.show_auc:
                auc = curve.auc()
                auc_values.append(auc)
                legend_labels.append("%s, AUC=%.4f" % (name, auc))
            else:
                legend_labels.append(name)

            if not fig:
                # The first curve provides the figure and the axes that all
                # the other curves are drawn on
                dpi = self.options.dpi
                figsize = parse_size(self.options.size, dpi=dpi)
                fig = curve.get_empty_figure(dpi=dpi, figsize=figsize)
                axes = fig.get_axes()[0]

            handles.append(curve.plot_on_axes(axes, style=style, legend=False))

        if auc_values:
            # List the legend entries in decreasing order of AUC
            order = sorted(xrange(len(auc_values)),
                           key=lambda idx: auc_values[idx],
                           reverse=True)
            handles = [handles[idx] for idx in order]
            legend_labels = [legend_labels[idx] for idx in order]

        if axes:
            legend_pos = "best"

            # Switch to logarithmic axes where requested
            if "x" in self.options.log_scale:
                axes.set_xscale("log")
                legend_pos = "upper left"
            if "y" in self.options.log_scale:
                axes.set_yscale("log")

            # Finally, draw the legend
            axes.legend(handles, legend_labels, loc=legend_pos)

        return fig