Beispiel #1
0
    def test_isclose(self):
        """
        Check ``ht.isclose`` for array, scalar and invalid operands.

        Covers the result type and shape, default versus explicit ``atol`` /
        ``rtol``, Python scalars on either side, zero arrays created with
        different ``split`` settings, the ``DNDarray.isclose`` method form,
        plain-scalar input, and operands that must be rejected with ``TypeError``.
        """
        n_procs = ht.communication.MPI_WORLD.size
        # the zero arrays grow with the number of MPI processes
        dist_shape = (4 * n_procs, 6)

        twos = ht.float32([[2, 2], [2, 2]], device=ht_device)
        near_twos = ht.float32(
            [[2.00005, 2.00005], [2.00005, 2.00005]], device=ht_device
        )
        zeros_split0 = ht.zeros(dist_shape, split=0, device=ht_device)
        zeros_split1 = ht.zeros(dist_shape, split=1, device=ht_device)
        zeros_unsplit = ht.zeros(dist_shape, device=ht_device)

        # result type and shape
        self.assertIsInstance(ht.isclose(twos, near_twos), ht.DNDarray)
        self.assertTrue(ht.isclose(twos, near_twos).shape == (2, 2))

        # a 5e-5 offset is expected to fail with the default tolerances and to
        # pass once either tolerance is raised to 1e-4
        self.assertFalse(ht.isclose(twos, near_twos)[0][0].item())
        self.assertTrue(ht.isclose(twos, near_twos, atol=1e-04)[0][1].item())
        self.assertTrue(ht.isclose(twos, near_twos, rtol=1e-04)[1][0].item())

        # Python scalars are accepted on either side
        self.assertTrue(ht.isclose(twos, 2)[0][1].item())
        self.assertTrue(ht.isclose(twos, 2.0)[0][0].item())
        self.assertTrue(ht.isclose(2, twos)[1][1].item())

        # operands created with different split settings
        self.assertTrue(ht.isclose(zeros_split0, zeros_split1).shape == dist_shape)
        self.assertTrue(ht.isclose(zeros_split0, zeros_unsplit)[0][0].item())
        self.assertTrue(zeros_unsplit.isclose(zeros_split0)[-1][-1].item())

        # test scalar input
        self.assertIsInstance(ht.isclose(2.0, 2.00005), bool)

        # unsupported operand types must be rejected
        for lhs, rhs in ((twos, (2, 2, 2, 2)), (twos, "?"), ("?", twos)):
            with self.assertRaises(TypeError):
                ht.isclose(lhs, rhs)
Beispiel #2
0
    def __partial_fit(self,
                      X,
                      y,
                      classes=None,
                      _refit=False,
                      sample_weight=None):
        """
        Worker routine behind Gaussian NB fitting; adapted to HeAT from scikit-learn.

        Folds the samples in ``X`` into the per-class running statistics
        (``theta_``, ``sigma_``, ``class_count_``). When the estimator was
        created without user priors, ``class_prior_`` is recomputed from the
        class counts at the end.

        Parameters
        ----------
        X : ht.DNDarray of shape (n_samples, n_features)
            Training set, where n_samples is the number of samples and
            n_features is the number of features.
        y : ht.DNDarray of shape (n_samples,)
            Labels for training set.
        classes : ht.DNDarray of shape (n_classes,), optional (default=None)
            All classes that can possibly appear in ``y``. Forwarded to the
            first-call check.
        _refit : bool, optional (default=False)
            If true, ``classes_`` is reset to ``None`` before the first-call
            check, so the call is treated as an initial fit.
        sample_weight : ht.DNDarray of shape (n_samples,), optional (default=None)
            Weights applied to individual samples (1. for unweighted).

        Returns
        -------
        self : object
            The estimator itself.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, if ``y`` or ``sample_weight`` do not match the
            number of samples, if the priors are inconsistent with the classes,
            if the feature count differs from previous data, or if ``y``
            contains labels that are not in the known classes.
        """
        # TODO: sanitize X and y shape: sanitation/validation module, cf. #468
        num_samples = X.shape[0]
        if X.numdims != 2:
            raise ValueError(
                "expected X to be a 2-D tensor, is {}-D".format(X.numdims))
        if y.shape[0] != num_samples:
            raise ValueError(
                "y.shape[0] must match number of samples {}, is {}".format(
                    num_samples, y.shape[0]))

        # TODO: sanitize sample_weight: sanitation/validation module, cf. #468
        if sample_weight is not None:
            if sample_weight.numdims != 1:
                raise ValueError("Sample weights must be 1D tensor")
            expected_weight_shape = (num_samples, )
            if sample_weight.shape != expected_weight_shape:
                raise ValueError(
                    "sample_weight.shape == {}, expected {}!".format(
                        sample_weight.shape, expected_weight_shape))

        # Variance smoothing: epsilon is var_smoothing times the largest
        # per-feature variance of X. It is added to every entry of sigma_
        # after the update loop to avoid numerical problems when variances
        # differ strongly between features.
        self.epsilon_ = self.var_smoothing * ht.var(X, axis=0).max()

        if _refit:
            self.classes_ = None

        is_first_call = self.__check_partial_fit_first_call(classes)
        if not is_first_call:
            if X.shape[1] != self.theta_.shape[1]:
                raise ValueError(
                    "Number of features {} does not match previous data {}.".
                    format(X.shape[1], self.theta_.shape[1]))
            # Take the smoothing term out of sigma_ before updating; it is
            # added back after the loop.
            self.sigma_[:, :] -= self.epsilon_
        else:
            # First call: set up the cumulative per-class statistics.
            n_features = X.shape[1]
            n_classes = len(self.classes_)
            stats_shape = (n_classes, n_features)
            self.theta_ = ht.zeros(stats_shape, dtype=X.dtype, device=X.device)
            self.sigma_ = ht.zeros(stats_shape, dtype=X.dtype, device=X.device)
            self.class_count_ = ht.zeros((n_classes, ),
                                         dtype=ht.float64,
                                         device=X.device)

            if self.priors is None:
                # No user priors: start from zeros, filled in at the end.
                self.class_prior_ = ht.zeros(len(self.classes_),
                                             dtype=ht.float64,
                                             split=None,
                                             device=X.device)
            else:
                if isinstance(self.priors, ht.DNDarray):
                    priors = self.priors
                else:
                    priors = ht.array(self.priors,
                                      dtype=X.dtype,
                                      split=None,
                                      device=X.device)
                # One prior per class ...
                if len(priors) != n_classes:
                    raise ValueError(
                        "Number of priors must match number of classes.")
                # ... summing to one ...
                if not ht.isclose(priors.sum(),
                                  ht.array(1.0, dtype=priors.dtype)):
                    raise ValueError("The sum of the priors should be 1.")
                # ... and none of them negative.
                if (priors < 0).any():
                    raise ValueError("Priors must be non-negative.")
                self.class_prior_ = priors

        known_classes = self.classes_

        labels_present = ht.unique(y, sorted=True)
        if labels_present.split is not None:
            labels_present = ht.resplit(labels_present, axis=None)
        label_is_known = ht.eq(labels_present, known_classes)

        if not ht.all(label_is_known):
            raise ValueError(
                "The target label(s) {} in y do not exist in the "
                "initial classes {}".format(labels_present[~label_is_known],
                                            known_classes))

        for label in labels_present:
            # assuming known_classes.split is None
            if label in known_classes:
                row = ht.where(known_classes == label).item()
            else:
                extended_classes = torch.cat(
                    (known_classes._DNDarray__array,
                     label._DNDarray__array.unsqueeze(0)))
                row = torch.argsort(extended_classes)[-1].item()
            member_idx = ht.where(y == label)._DNDarray__array.tolist()
            X_label = X[member_idx, :]

            if sample_weight is None:
                weights_label = None
                weight_total = X_label.shape[0]
            else:
                weights_label = sample_weight[member_idx]
                if 0 in weights_label.shape:
                    # no weights selected for this label
                    weight_total = 0.0
                    weights_label = None
                else:
                    weight_total = weights_label.sum()

            updated_theta, updated_sigma = self.__update_mean_variance(
                self.class_count_[row], self.theta_[row, :],
                self.sigma_[row, :], X_label, weights_label)

            self.theta_[row, :] = updated_theta
            self.sigma_[row, :] = updated_sigma
            self.class_count_[row] += weight_total

        self.sigma_[:, :] += self.epsilon_

        # Only derive the priors from the data when the user supplied none.
        if self.priors is None:
            # Empirical prior, with sample_weight taken into account
            self.class_prior_ = self.class_count_ / self.class_count_.sum()

        return self
Beispiel #3
0
    def test_fit_iris(self):
        """
        Fit ``GaussianNB`` on the iris training data and compare with stored reference values.

        The test loads train/test splits and reference probabilities from CSV
        files under ``heat/datasets/data``. It checks the fitted attributes and
        predictions for non-distributed data (with and without sample weights)
        and for data split along axis 0. It then verifies that invalid inputs
        are rejected with ``ValueError``.
        """
        # load sklearn train/test sets and resulting probabilities
        X_train = ht.load(
            "heat/datasets/data/iris_X_train.csv", sep=";", dtype=ht.float64, device=ht_device
        )
        X_test = ht.load(
            "heat/datasets/data/iris_X_test.csv", sep=";", dtype=ht.float64, device=ht_device
        )
        y_train = ht.load(
            "heat/datasets/data/iris_y_train.csv", sep=";", dtype=ht.int64, device=ht_device
        ).squeeze()
        y_test = ht.load(
            "heat/datasets/data/iris_y_test.csv", sep=";", dtype=ht.int64, device=ht_device
        ).squeeze()
        y_pred_proba_sklearn = ht.load(
            "heat/datasets/data/iris_y_pred_proba.csv", sep=";", dtype=ht.float64, device=ht_device
        )

        # test ht.GaussianNB
        from heat.naive_bayes import GaussianNB

        gnb_heat = GaussianNB()
        self.assertIsNone(gnb_heat.priors)
        # fitted attributes must not exist before the first fit
        with self.assertRaises(AttributeError):
            gnb_heat.classes_
        with self.assertRaises(AttributeError):
            gnb_heat.class_prior_
        with self.assertRaises(AttributeError):
            gnb_heat.epsilon_

        # test GaussianNB locally, no weights
        local_fit = gnb_heat.fit(X_train, y_train)
        self.assert_array_equal(gnb_heat.classes_, np.array([0, 1, 2]))
        local_fit_no_classes = gnb_heat.partial_fit(X_train, y_train, classes=None)
        y_pred_local = local_fit_no_classes.predict(X_test)
        y_pred_proba_local = local_fit.predict_proba(X_test)
        sklearn_class_prior = np.array([0.38666667, 0.26666667, 0.34666667])
        sklearn_epsilon = np.array([3.6399040000000003e-09])
        sklearn_theta = ht.array(
            [
                [4.97586207, 3.35862069, 1.44827586, 0.23448276],
                [5.935, 2.71, 4.185, 1.3],
                [6.77692308, 3.09230769, 5.73461538, 2.10769231],
            ],
            dtype=X_train.dtype,
            device=ht_device,
        )
        sklearn_sigma = ht.array(
            [
                [0.10321047, 0.13208086, 0.01629013, 0.00846612],
                [0.256275, 0.0829, 0.255275, 0.046],
                [0.38869823, 0.10147929, 0.31303255, 0.04763314],
            ],
            dtype=X_train.dtype,
            device=ht_device,
        )
        self.assertIsInstance(y_pred_local, ht.DNDarray)
        self.assertEqual((y_pred_local != y_test).sum(), ht.array(4))
        self.assert_array_equal(gnb_heat.class_prior_, sklearn_class_prior)
        self.assert_array_equal(gnb_heat.epsilon_, sklearn_epsilon)
        self.assertTrue(ht.isclose(gnb_heat.theta_, sklearn_theta).all())
        self.assertTrue(ht.isclose(gnb_heat.sigma_, sklearn_sigma, atol=1e-1).all())
        self.assertTrue(ht.isclose(y_pred_proba_sklearn, y_pred_proba_local, atol=1e-1).all())

        # test GaussianNB when sample_weight is not None, sample_weight not distributed
        sample_weight = ht.ones((y_train.gshape[0]), dtype=ht.float32, split=None)
        local_fit_weight = gnb_heat.fit(X_train, y_train, sample_weight=sample_weight)
        y_pred_local_weight = local_fit_weight.predict(X_test)
        y_pred_proba_local_weight = local_fit_weight.predict_proba(X_test)
        self.assertIsInstance(y_pred_local_weight, ht.DNDarray)
        self.assert_array_equal(gnb_heat.class_prior_, sklearn_class_prior)
        self.assert_array_equal(gnb_heat.epsilon_, sklearn_epsilon)
        self.assertTrue(ht.isclose(gnb_heat.theta_, sklearn_theta).all())
        self.assertTrue(ht.isclose(gnb_heat.sigma_, sklearn_sigma, atol=1e-1).all())
        self.assert_array_equal(y_pred_local_weight, y_pred_local.numpy())
        self.assertTrue(ht.isclose(y_pred_proba_sklearn, y_pred_proba_local_weight).all())

        # test GaussianNB, data and labels distributed along split axis 0
        X_train_split = ht.resplit(X_train, axis=0)
        X_test_split = ht.resplit(X_test, axis=0)
        y_train_split = ht.resplit(y_train, axis=0)
        y_test_split = ht.resplit(y_test, axis=0)
        y_pred_split = gnb_heat.fit(X_train_split, y_train_split).predict(X_test_split)
        self.assert_array_equal(gnb_heat.class_prior_, sklearn_class_prior)
        self.assert_array_equal(gnb_heat.epsilon_, sklearn_epsilon)
        self.assertTrue(ht.isclose(gnb_heat.theta_, sklearn_theta).all())
        self.assertTrue(ht.isclose(gnb_heat.sigma_, sklearn_sigma, atol=1e-1).all())
        self.assert_array_equal(y_pred_split, y_pred_local.numpy())
        self.assertEqual((y_pred_split != y_test_split).sum(), ht.array(4))
        sample_weight_split = ht.ones(y_train_split.gshape[0], dtype=ht.float32, split=0)
        y_pred_split_weight = gnb_heat.fit(
            X_train_split, y_train_split, sample_weight=sample_weight_split
        ).predict(X_test_split)
        self.assertIsInstance(y_pred_split_weight, ht.DNDarray)
        self.assert_array_equal(y_pred_split_weight, y_pred_split.numpy())

        # test exceptions
        X_torch = torch.ones(75, 4)
        y_np = np.zeros(75)
        # NOTE(review): y_2D used to be created with split=None and was then
        # overwritten by a chained assignment with this split=0 array; the
        # effective (split=0) value is kept so the exercised input is unchanged.
        y_2D = ht.ones((75, 1), split=0, device=ht_device)
        weights_torch = torch.zeros(75)
        X_3D = ht.ones((75, 4, 4), split=None, device=ht_device)
        X_wrong_size = ht.ones((75, 5), split=None, device=ht_device)
        y_wrong_size = ht.zeros(76, device=ht_device)
        weights_2D_split = ht.ones((75, 1), split=0, device=ht_device)
        weights_wrong_size = ht.ones(76, device=ht_device)
        priors_wrong_shape = ht.random.randn(4, device=ht_device)
        priors_wrong_sum = ht.random.randn(3, dtype=ht.float32, device=ht_device)
        priors_wrong_sign = ht.array([-0.3, 0.7, 0.6])
        wrong_classes = ht.array([3, 4, 5])

        with self.assertRaises(ValueError):
            gnb_heat.fit(X_torch, y_train)
        with self.assertRaises(ValueError):
            gnb_heat.fit(X_train, y_np)
        with self.assertRaises(ValueError):
            gnb_heat.fit(X_train, y_2D)
        with self.assertRaises(ValueError):
            gnb_heat.fit(X_train, y_train, sample_weight=weights_torch)
        with self.assertRaises(ValueError):
            gnb_heat.fit(X_3D, y_train)
        with self.assertRaises(ValueError):
            gnb_heat.fit(X_train, y_wrong_size)

        # Set-up steps are kept outside the assertRaises contexts below so that
        # only the call under test can satisfy the assertion; a ValueError
        # raised during set-up now fails the test instead of being swallowed.
        gnb_heat.fit(X_train, y_train)
        with self.assertRaises(ValueError):
            gnb_heat.predict(X_torch)

        gnb_heat.fit(X_train, y_train)
        with self.assertRaises(ValueError):
            gnb_heat.partial_fit(X_wrong_size, y_train)

        gnb_heat.fit(X_train, y_train)
        with self.assertRaises(ValueError):
            gnb_heat.partial_fit(X_train, y_train, classes=wrong_classes)

        gnb_heat.classes_ = None
        with self.assertRaises(ValueError):
            gnb_heat.partial_fit(X_train, y_train, classes=None)

        with self.assertRaises(ValueError):
            gnb_heat.fit(X_train_split, y_train_split, sample_weight=weights_2D_split)
        with self.assertRaises(ValueError):
            gnb_heat.fit(X_train, y_train, sample_weight=weights_wrong_size)

        # invalid priors: wrong length, wrong sum, negative entry
        gnb_heat.priors = priors_wrong_shape
        with self.assertRaises(ValueError):
            gnb_heat.fit(X_train, y_train)

        gnb_heat.priors = priors_wrong_sum
        with self.assertRaises(ValueError):
            gnb_heat.fit(X_train, y_train)

        gnb_heat.priors = priors_wrong_sign
        with self.assertRaises(ValueError):
            gnb_heat.fit(X_train, y_train)
Beispiel #4
0
    def __partial_fit(
        self,
        x: DNDarray,
        y: DNDarray,
        classes: Optional[DNDarray] = None,
        _refit: bool = False,
        sample_weight: Optional[DNDarray] = None,
    ):
        """
        Actual implementation of Gaussian NB fitting. Adapted to HeAT from scikit-learn.

        Updates the per-class statistics ``theta_``, ``sigma_`` and
        ``class_count_`` with the samples in ``x``. When no priors were
        supplied, ``class_prior_`` is recomputed from the class counts.

        Parameters
        ----------
        x : DNDarray
            Training set, where n_samples is the number of samples and
            n_features is the number of features. Shape = (n_samples, n_features)
        y : DNDarray
            Labels for training set. Shape = (n_samples,)
        classes : DNDarray, optional
            List of all the classes that can possibly appear in the y vector.
            Must be provided at the first call to :func:`partial_fit`, can be omitted
            in subsequent calls. Shape = (n_classes,)
        _refit : bool, optional
            If ``True``, act as though this were the first time :func:`__partial_fit` is called
            (ie, throw away any past fitting and start over).
        sample_weight : DNDarray, optional
            Weights applied to individual samples (1. for unweighted). Shape = (n_samples,)

        Returns
        -------
        self
            The estimator itself.

        Raises
        ------
        ValueError
            If ``x`` is not 2-D, if ``y`` or ``sample_weight`` do not match the
            number of samples, if the priors do not match the number of classes,
            do not sum to 1 or contain negative values, if the number of features
            differs from previously seen data, or if ``y`` contains labels that
            are not among the known classes.
        """
        # TODO: sanitize x and y shape: sanitation/validation module, cf. #468
        # Check dimensionality before reading shape[0], so a 0-d input gets the
        # descriptive ValueError rather than an IndexError.
        if x.ndim != 2:
            raise ValueError("expected x to be a 2-D tensor, is {}-D".format(
                x.ndim))
        n_samples = x.shape[0]
        if y.shape[0] != n_samples:
            raise ValueError(
                "y.shape[0] must match number of samples {}, is {}".format(
                    n_samples, y.shape[0]))

        # TODO: sanitize sample_weight: sanitation/validation module, cf. #468
        if sample_weight is not None:
            if sample_weight.ndim != 1:
                raise ValueError("Sample weights must be 1D tensor")
            if sample_weight.shape != (n_samples, ):
                raise ValueError(
                    "sample_weight.shape == {}, expected {}!".format(
                        sample_weight.shape, (n_samples, )))

        # If the ratio of data variance between dimensions is too small, it
        # will cause numerical errors. To address this, we artificially
        # boost the variance by epsilon, a small fraction of the variance
        # of the largest dimension.
        self.epsilon_ = self.var_smoothing * ht.var(x, axis=0).max()

        if _refit:
            self.classes_ = None

        if self.__check_partial_fit_first_call(classes):
            # This is the first call to partial_fit:
            # initialize various cumulative counters
            n_features = x.shape[1]
            n_classes = len(self.classes_)
            self.theta_ = ht.zeros((n_classes, n_features),
                                   dtype=x.dtype,
                                   device=x.device)
            self.sigma_ = ht.zeros((n_classes, n_features),
                                   dtype=x.dtype,
                                   device=x.device)

            # one row of class counters per process (split along axis 0);
            # the rows are summed at the end of this method
            self.class_count_ = ht.zeros((x.comm.size, n_classes),
                                         dtype=ht.float64,
                                         device=x.device,
                                         split=0)
            # Initialise the class prior
            # Take into account the priors
            if self.priors is not None:
                if not isinstance(self.priors, ht.DNDarray):
                    priors = ht.array(self.priors,
                                      dtype=x.dtype,
                                      split=None,
                                      device=x.device)
                else:
                    priors = self.priors
                # Check that the provided priors match the number of classes
                if len(priors) != n_classes:
                    raise ValueError("Number of priors must match number of"
                                     " classes.")
                # Check that the sum is 1
                if not ht.isclose(priors.sum(),
                                  ht.array(1.0, dtype=priors.dtype)):
                    raise ValueError("The sum of the priors should be 1.")
                # Check that the priors are non-negative
                if (priors < 0).any():
                    raise ValueError("Priors must be non-negative.")
                self.class_prior_ = priors
            else:
                # Initialize the priors to zeros for each class
                self.class_prior_ = ht.zeros(len(self.classes_),
                                             dtype=ht.float64,
                                             split=None,
                                             device=x.device)
        else:
            if x.shape[1] != self.theta_.shape[1]:
                raise ValueError(
                    "Number of features {} does not match previous data {}.".
                    format(x.shape[1], self.theta_.shape[1]))
            # Take the smoothing term out of sigma_ before updating; it is
            # added back after the loop below.
            # NOTE(review): epsilon_ has just been recomputed from the current
            # x, so the value subtracted here may differ from the one added by
            # the previous call - confirm this is intended.
            self.sigma_[:, :] -= self.epsilon_

        classes = self.classes_

        unique_y = ht.unique(y, sorted=True).resplit_(None)
        unique_y_in_classes = ht.eq(unique_y, classes)

        if not ht.all(unique_y_in_classes):
            raise ValueError("The target label(s) {} in y do not exist in the "
                             "initial classes {}".format(
                                 unique_y[~unique_y_in_classes], classes))
        # from now on: extract torch tensors for local operations
        # DNDarrays for distributed operations only
        for y_i in unique_y.larray:
            # assuming classes.split is None
            if y_i in classes.larray:
                i = torch.where(classes.larray == y_i)[0].item()
            else:
                # y_i is already a torch tensor (an element of
                # unique_y.larray), so it is used directly here
                classes_ext = torch.cat((classes.larray, y_i.unsqueeze(0)))
                i = torch.argsort(classes_ext)[-1].item()
            where_y_i = torch.where(y.larray == y_i)[0]
            X_i = x[where_y_i, :]

            if sample_weight is not None:
                sw_i = sample_weight[where_y_i]
                if 0 not in sw_i.shape:
                    N_i = sw_i.sum().item()
                else:
                    # no weights selected for this label
                    N_i = 0.0
                    sw_i = None
            else:
                sw_i = None
                N_i = X_i.shape[0]

            new_theta, new_sigma = self.__update_mean_variance(
                self.class_count_.larray[:, i].item(),
                self.theta_[i, :],
                self.sigma_[i, :],
                X_i,
                sw_i,
            )
            self.theta_[i, :] = new_theta
            self.sigma_[i, :] = new_sigma
            self.class_count_.larray[:, i] += N_i

        self.sigma_[:, :] += self.epsilon_

        # Update only if no priors are provided
        if self.priors is None:
            # distributed class_count_: sum along distribution axis
            self.class_count_ = self.class_count_.sum(axis=0, keepdim=True)
            # Empirical prior, with sample_weight taken into account
            self.class_prior_ = (self.class_count_ /
                                 self.class_count_.sum()).squeeze(0)

        return self