def test_get_contingencies(self):
        """Exercise ``get_contingencies`` on the data from ``_construct_sparse``.

        The default call and the ``skipDiscrete`` / ``skipContinuous``
        variants are each checked for the number of returned contingencies,
        and one ``Discrete`` and one ``Continuous`` entry are spot-checked.
        """
        # (lookup key, expected value) pairs; keys mix indices and names.
        discrete_expected = ((0, [1, 0, 0]), ("b", [0, 1, 1]), (2, [1, 0, 0]))
        continuous_expected = (
            (0, [[], []]), ("b", [[1], [1]]), (2, [[2], [1]]))

        def verify(entry, expected_type, expected_items):
            # Check the contingency type first, then each expected lookup.
            self.assertIsInstance(entry, expected_type)
            for key, expected in expected_items:
                np.testing.assert_almost_equal(entry[key], expected)

        data = self._construct_sparse()

        everything = contingency.get_contingencies(data)
        self.assertEqual(len(everything), 20)
        verify(everything[5], contingency.Discrete, discrete_expected)
        verify(everything[14], contingency.Continuous, continuous_expected)

        without_discrete = contingency.get_contingencies(
            data, skipDiscrete=True)
        self.assertEqual(len(without_discrete), 10)
        verify(without_discrete[4], contingency.Continuous,
               continuous_expected)

        without_continuous = contingency.get_contingencies(
            data, skipContinuous=True)
        self.assertEqual(len(without_continuous), 10)
        verify(without_continuous[5], contingency.Discrete, discrete_expected)
Beispiel #2
0
    def test_get_contingencies(self):
        """Exercise ``get_contingencies`` on the data from ``_construct_sparse``.

        The default call and the ``skip_discrete`` / ``skip_continuous``
        variants are each checked for the number of returned contingencies,
        and one ``Discrete`` and one ``Continuous`` entry are spot-checked
        with ``assert_dist_equal``.
        """
        # (lookup key, expected value) pairs; keys mix indices and names.
        discrete_expected = ((0, [2, 0, 0]), ("b", [0, 1, 1]), (2, [1, 0, 0]))
        continuous_expected = (
            (0, [[], []]), ("b", [[1], [1]]), (2, [[2], [1]]))

        def verify(entry, expected_type, expected_items):
            # Check the contingency type first, then each expected lookup.
            self.assertIsInstance(entry, expected_type)
            for key, expected in expected_items:
                assert_dist_equal(entry[key], expected)

        data = self._construct_sparse()

        everything = contingency.get_contingencies(data)
        self.assertEqual(len(everything), 20)
        verify(everything[5], contingency.Discrete, discrete_expected)
        verify(everything[14], contingency.Continuous, continuous_expected)

        without_discrete = contingency.get_contingencies(
            data, skip_discrete=True)
        self.assertEqual(len(without_discrete), 10)
        verify(without_discrete[4], contingency.Continuous,
               continuous_expected)

        without_continuous = contingency.get_contingencies(
            data, skip_continuous=True)
        self.assertEqual(len(without_continuous), 10)
        verify(without_continuous[5], contingency.Discrete, discrete_expected)
Beispiel #3
0
    def fit_storage(self, table):
        """Build a ``NaiveBayesModel`` from the contingencies of ``table``.

        Class probabilities and the per-contingency probability ratios are
        Laplace-smoothed, counting only classes with non-zero frequency.

        Raises:
            TypeError: ``table`` is not a ``Storage`` instance.
            NotImplementedError: a variable in the domain is not discrete.
            ValueError: every class frequency is zero.
        """
        if not isinstance(table, Storage):
            raise TypeError("Data is not a subclass of Orange.data.Storage.")
        for var in table.domain.variables:
            if not var.is_discrete:
                raise NotImplementedError(
                    "Only discrete variables are supported.")

        contingencies = contingency.get_contingencies(table)
        class_contingency = contingency.get_contingency(
            table, table.domain.class_var)
        class_freq = np.array(np.diag(class_contingency))
        n_present = (class_freq != 0).sum()
        if not n_present:
            raise ValueError("Data has no defined target values")

        # Smoothing counts only the classes that occur in the data, so that
        # empty (or spurious) classes left over from Orange's reuse of
        # variables do not affect the probabilities. See GH-2943.
        class_prob = (class_freq + 1) / (np.sum(class_freq) + n_present)

        log_cont_prob = []
        for c in contingencies:
            counts = np.array(c)
            column_totals = np.sum(counts, axis=0)[None, :]
            smoothed = (counts + 1) / (column_totals + n_present)
            log_cont_prob.append(np.log(smoothed / class_prob[:, None]))

        # Absent classes are zeroed only now: their mock non-zero values
        # were needed above to avoid dividing by zero.
        class_prob[class_freq == 0] = 0
        return NaiveBayesModel(log_cont_prob, class_prob, table.domain)
Beispiel #4
0
    def fit_storage(self, table):
        """Fit a ``NaiveBayesModel`` on ``table`` using smoothed contingencies.

        Only classes with a non-zero frequency take part in the Laplacian
        smoothing of the class probabilities and of the probability ratios.

        Raises:
            TypeError: ``table`` is not a ``Storage`` instance.
            NotImplementedError: a variable in the domain is not discrete.
            ValueError: every class frequency is zero.
        """
        if not isinstance(table, Storage):
            raise TypeError("Data is not a subclass of Orange.data.Storage.")
        has_non_discrete = not all(
            var.is_discrete for var in table.domain.variables)
        if has_non_discrete:
            raise NotImplementedError("Only discrete variables are supported.")

        cont = contingency.get_contingencies(table)
        class_diagonal = np.diag(
            contingency.get_contingency(table, table.domain.class_var))
        class_freq = np.array(class_diagonal)
        nclss = (class_freq != 0).sum()
        if not nclss:
            raise ValueError("Data has no defined target values")

        # Classes missing from the data are excluded from the smoothing
        # count; otherwise empty (or completely spurious) classes created by
        # Orange's reuse of variables would skew the result. See GH-2943.
        class_prob = (class_freq + 1) / (np.sum(class_freq) + nclss)

        def log_ratio(c):
            # Smoothed column-wise probabilities divided by class priors.
            counts = np.array(c)
            denominator = np.sum(counts, axis=0)[None, :] + nclss
            return np.log((counts + 1) / denominator / class_prob[:, None])

        log_cont_prob = [log_ratio(c) for c in cont]

        # Reset absent classes to zero only after the ratios are computed,
        # since the mock non-zero values prevent division by zero there.
        class_prob[class_freq == 0] = 0
        return NaiveBayesModel(log_cont_prob, class_prob, table.domain)
Beispiel #5
0
    def fit_storage(self, table):
        """Build a ``NaiveBayesModel`` from the contingencies of ``table``.

        The model receives the contingencies, the diagonal of the class
        variable's contingency, and the table's domain.

        Raises:
            TypeError: ``table`` is not a ``Storage`` instance.
            NotImplementedError: a domain variable is not a
                ``DiscreteVariable``.
        """
        if not isinstance(table, Storage):
            raise TypeError("Data is not a subclass of Orange.data.Storage.")
        if any(not isinstance(var, DiscreteVariable)
               for var in table.domain.variables):
            raise NotImplementedError("Only discrete variables are supported.")

        contingencies = contingency.get_contingencies(table)
        class_contingency = contingency.get_contingency(
            table, table.domain.class_var)
        class_freq = np.diag(class_contingency)
        return NaiveBayesModel(contingencies, class_freq, table.domain)
Beispiel #6
0
    def test_contingencies(self):
        """Check the contingency types for a three-attribute iris domain.

        The domain is rebuilt from the first two entries of the original
        domain plus an ``EqualWidth`` transform of 'sepal width', with
        'iris' as the class variable.
        """
        iris = SqlTable(self.conn, self.iris, inspect_values=True)
        leading_vars = iris.domain[:2]
        binned_width = EqualWidth()(iris, iris.domain['sepal width'])
        iris.domain = Domain(leading_vars + (binned_width,),
                             iris.domain['iris'])

        conts = get_contingencies(iris)
        self.assertEqual(len(conts), 3)
        # Expected contingency type at each position, in order.
        expected_types = (Continuous, Continuous, Discrete)
        for index, expected_type in enumerate(expected_types):
            self.assertIsInstance(conts[index], expected_type)
Beispiel #7
0
    def test_contingencies(self):
        """Verify ``get_contingencies`` result types on a modified iris table.

        The table's domain is replaced by its first two entries followed by
        an ``EqualWidth`` transform of 'sepal width'; 'iris' is the class.
        """
        iris = SqlTable(self.conn, self.iris, inspect_values=True)
        first_two = iris.domain[:2]
        discretized = EqualWidth()(iris, iris.domain['sepal width'])
        attributes = first_two + (discretized,)
        iris.domain = Domain(attributes, iris.domain['iris'])

        conts = get_contingencies(iris)
        self.assertEqual(len(conts), 3)
        # Positions 0 and 1 must be Continuous, position 2 Discrete.
        for position in (0, 1):
            self.assertIsInstance(conts[position], Continuous)
        self.assertIsInstance(conts[2], Discrete)
Beispiel #8
0
    def draw_distributions(self):
        """Draw class-distribution bars on the axes of discrete attributes.

        Does nothing unless ``show_distributions``, ``have_data`` and
        ``data_has_discrete_class`` are all truthy. On first use, the
        contingencies of the discrete variables (and of the class variable
        against itself) are computed and cached in
        ``self.domain_contingencies``.

        For every displayed discrete attribute, one ``PolygonCurve`` bar is
        attached per (attribute value, class value) pair. Bar width is the
        count scaled so that the largest cached count spans 0.5 axis units.
        """
        if not (self.show_distributions and self.have_data and self.data_has_discrete_class):
            return
        class_count = len(self.data_domain.class_var.values)
        class_ = self.data_domain.class_var

        # Contingencies are computed once and cached, keyed by variable.
        if self.domain_contingencies is None:
            self.domain_contingencies = dict(
                zip(
                    [attr for attr in self.data_domain if isinstance(attr, DiscreteVariable)],
                    get_contingencies(self.raw_data, skipContinuous=True),
                )
            )
            self.domain_contingencies[class_] = get_contingency(self.raw_data, class_, class_)

        # The trailing `or 1` keeps the width division below well defined
        # when every cached count is zero (bars then have zero width).
        max_count = max([cont.max() for cont in self.domain_contingencies.values()] or [1]) or 1
        sorted_class_values = get_variable_values_sorted(self.data_domain.class_var)

        for axis_idx, attr_idx in enumerate(self.attribute_indices):
            attr = self.data_domain[attr_idx]
            # Only discrete attributes have a cached contingency and a
            # `values` list, so everything else is skipped.
            if not isinstance(attr, DiscreteVariable):
                continue

            attr_contingency = self.domain_contingencies[attr]
            attr_len = len(attr.values)

            sorted_variable_values = get_variable_values_sorted(attr)

            # One group of bars per attribute value, one bar per class value.
            for j in range(attr_len):
                attribute_value = sorted_variable_values[j]
                value_count = attr_contingency[:, attribute_value]

                for i in range(class_count):
                    class_value = sorted_class_values[i]

                    color = QColor(self.discrete_palette[i])
                    color.setAlpha(self.alpha_value)

                    width = float(value_count[class_value] * 0.5) / float(max_count)
                    # Vertical centre of the j-th value's slot on the axis.
                    y_off = float(1.0 + 2.0 * j) / float(2 * attr_len)
                    height = 0.7 / float(class_count * attr_len)

                    # Bars of one value are stacked downwards, centred on y_off.
                    y_low_bottom = y_off + float(class_count * height) / 2.0 - i * height
                    curve = PolygonCurve(
                        QPen(color),
                        QBrush(color),
                        xData=[axis_idx, axis_idx + width, axis_idx + width, axis_idx],
                        yData=[y_low_bottom, y_low_bottom, y_low_bottom - height, y_low_bottom - height],
                        tooltip=attr.name,
                    )
                    curve.attach(self)
Beispiel #9
0
    def fit_storage(self, table):
        """Create a ``NaiveBayesModel`` from the contingencies of ``table``.

        The model is given the contingencies, a copy of the diagonal of the
        class variable's contingency, and the table's domain.

        Raises:
            TypeError: ``table`` is not a ``Storage`` instance.
            NotImplementedError: a variable in the domain is not discrete.
        """
        if not isinstance(table, Storage):
            raise TypeError("Data is not a subclass of Orange.data.Storage.")
        for var in table.domain.variables:
            if not var.is_discrete:
                raise NotImplementedError(
                    "Only discrete variables are supported.")

        contingencies = contingency.get_contingencies(table)
        class_contingency = contingency.get_contingency(
            table, table.domain.class_var)
        class_freq = np.array(np.diag(class_contingency))
        return NaiveBayesModel(contingencies, class_freq, table.domain)
Beispiel #10
0
    def draw_distributions(self):
        """Draw class-distribution bars on the axes of discrete attributes.

        Returns immediately unless ``show_distributions``, ``have_data`` and
        ``data_has_discrete_class`` are all truthy. The contingencies of the
        discrete variables (and of the class variable against itself) are
        computed on first use and cached in ``self.domain_contingencies``.

        Each displayed discrete attribute gets one ``PolygonCurve`` bar per
        (attribute value, class value) pair; the bar width is the count
        scaled so that the largest cached count spans 0.5 axis units.
        """
        if not (self.show_distributions and self.have_data and self.data_has_discrete_class):
            return
        class_count = len(self.data_domain.class_var.values)
        class_ = self.data_domain.class_var

        # Contingencies are computed once and cached, keyed by variable.
        if self.domain_contingencies is None:
            self.domain_contingencies = dict(
                zip([attr for attr in self.data_domain if isinstance(attr, DiscreteVariable)],
                    get_contingencies(self.raw_data, skipContinuous=True)))
            self.domain_contingencies[class_] = get_contingency(self.raw_data, class_, class_)

        # The trailing `or 1` avoids a division by zero in the width
        # computation when every cached count is zero.
        max_count = max([cont.max() for cont in self.domain_contingencies.values()] or [1]) or 1
        sorted_class_values = get_variable_values_sorted(self.data_domain.class_var)

        for axis_idx, attr_idx in enumerate(self.attribute_indices):
            attr = self.data_domain[attr_idx]
            # The cache holds contingencies for discrete variables only, and
            # only those have `values`; skip every other attribute.
            if not isinstance(attr, DiscreteVariable):
                continue

            attr_contingency = self.domain_contingencies[attr]
            attr_len = len(attr.values)

            sorted_variable_values = get_variable_values_sorted(attr)

            # One group of bars per attribute value, one bar per class value.
            for j in range(attr_len):
                attribute_value = sorted_variable_values[j]
                value_count = attr_contingency[:, attribute_value]

                for i in range(class_count):
                    class_value = sorted_class_values[i]

                    color = QColor(self.discrete_palette[i])
                    color.setAlpha(self.alpha_value)

                    width = float(value_count[class_value] * 0.5) / float(max_count)
                    # Vertical centre of the j-th value's slot on the axis.
                    y_off = float(1.0 + 2.0 * j) / float(2 * attr_len)
                    height = 0.7 / float(class_count * attr_len)

                    # Bars of one value are stacked downwards, centred on y_off.
                    y_low_bottom = y_off + float(class_count * height) / 2.0 - i * height
                    curve = PolygonCurve(QPen(color),
                                         QBrush(color),
                                         xData=[axis_idx, axis_idx + width,
                                                axis_idx + width, axis_idx],
                                         yData=[y_low_bottom, y_low_bottom, y_low_bottom - height,
                                                y_low_bottom - height],
                                         tooltip=attr.name)
                    curve.attach(self)
Beispiel #11
0
    def fit_storage(self, table):
        """Build a ``NaiveBayesModel`` with add-one smoothed probabilities.

        Class probabilities are smoothed over ``len(class_freq)`` classes;
        each contingency is smoothed over its first dimension and divided
        by the class probabilities before taking the logarithm.

        Raises:
            TypeError: ``table`` is not a ``Storage`` instance.
            NotImplementedError: a variable in the domain is not discrete.
        """
        if not isinstance(table, Storage):
            raise TypeError("Data is not a subclass of Orange.data.Storage.")
        for var in table.domain.variables:
            if not var.is_discrete:
                raise NotImplementedError(
                    "Only discrete variables are supported.")

        contingencies = contingency.get_contingencies(table)
        class_contingency = contingency.get_contingency(
            table, table.domain.class_var)
        class_freq = np.array(np.diag(class_contingency))
        class_prob = (class_freq + 1) / (np.sum(class_freq) + len(class_freq))

        log_cont_prob = []
        for c in contingencies:
            counts = np.array(c)
            column_totals = np.sum(counts, axis=0)[None, :]
            smoothed = (counts + 1) / (column_totals + c.shape[0])
            log_cont_prob.append(np.log(smoothed / class_prob[:, None]))
        return NaiveBayesModel(log_cont_prob, class_prob, table.domain)
Beispiel #12
0
    def fit_storage(self, table):
        """Fit a ``NaiveBayesModel`` on ``table`` from smoothed contingencies.

        Add-one smoothing is applied to the class frequencies and to every
        contingency; the model stores the logarithm of each smoothed
        contingency divided by the class probabilities.

        Raises:
            TypeError: ``table`` is not a ``Storage`` instance.
            NotImplementedError: a variable in the domain is not discrete.
        """
        if not isinstance(table, Storage):
            raise TypeError("Data is not a subclass of Orange.data.Storage.")
        has_non_discrete = not all(
            var.is_discrete for var in table.domain.variables)
        if has_non_discrete:
            raise NotImplementedError("Only discrete variables are supported.")

        cont = contingency.get_contingencies(table)
        class_diagonal = np.diag(
            contingency.get_contingency(table, table.domain.class_var))
        class_freq = np.array(class_diagonal)
        class_prob = (class_freq + 1) / (np.sum(class_freq) + len(class_freq))

        def log_ratio(c):
            # Smoothed column-wise probabilities divided by class priors.
            counts = np.array(c)
            denominator = np.sum(counts, axis=0)[None, :] + c.shape[0]
            return np.log((counts + 1) / denominator / class_prob[:, None])

        log_cont_prob = [log_ratio(c) for c in cont]
        return NaiveBayesModel(log_cont_prob, class_prob, table.domain)
Beispiel #13
0
 def fit_storage(self, table):
     """Build a ``BayesStorageClassifier`` from the contingencies of ``table``.

     The classifier receives the contingencies, the diagonal of the class
     variable's contingency, and the table's domain.
     """
     contingencies = contingency.get_contingencies(table)
     class_contingency = contingency.get_contingency(
         table, table.domain.class_var)
     class_freq = np.diag(class_contingency)
     return BayesStorageClassifier(contingencies, class_freq, table.domain)