def __partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None):
    """
    Actual implementation of Gaussian NB fitting. Adapted to HeAT from scikit-learn.

    Parameters
    ----------
    X : ht.tensor of shape (n_samples, n_features)
        Training set, where n_samples is the number of samples and
        n_features is the number of features.
    y : ht.tensor of shape (n_samples,)
        Labels for training set.
    classes : ht.tensor of shape (n_classes,), optional (default=None)
        List of all the classes that can possibly appear in the y vector.
        Must be provided at the first call to partial_fit, can be omitted
        in subsequent calls.
    _refit : bool, optional (default=False)
        If true, act as though this were the first time __partial_fit is
        called (i.e., throw away any past fitting and start over).
    sample_weight : ht.tensor of shape (n_samples,), optional (default=None)
        Weights applied to individual samples (1. for unweighted).

    Returns
    -------
    self : object
    """
    # TODO: sanitize X and y shape: sanitation/validation module, cf. #468
    n_samples = X.shape[0]
    if X.numdims != 2:
        raise ValueError("expected X to be a 2-D tensor, is {}-D".format(X.numdims))
    if y.shape[0] != n_samples:
        raise ValueError(
            "y.shape[0] must match number of samples {}, is {}".format(n_samples, y.shape[0])
        )
    # TODO: sanitize sample_weight: sanitation/validation module, cf. #468
    if sample_weight is not None:
        if sample_weight.numdims != 1:
            raise ValueError("Sample weights must be 1D tensor")
        if sample_weight.shape != (n_samples,):
            raise ValueError(
                "sample_weight.shape == {}, expected {}!".format(
                    sample_weight.shape, (n_samples,)
                )
            )

    # If the ratio of data variance between dimensions is too small, it
    # will cause numerical errors. To address this, we artificially
    # boost the variance by epsilon, a small fraction of the standard
    # deviation of the largest dimension.
    self.epsilon_ = self.var_smoothing * ht.var(X, axis=0).max()

    if _refit:
        self.classes_ = None

    if self.__check_partial_fit_first_call(classes):
        # This is the first call to partial_fit:
        # initialize various cumulative counters
        n_features = X.shape[1]
        n_classes = len(self.classes_)
        self.theta_ = ht.zeros((n_classes, n_features), dtype=X.dtype, device=X.device)
        self.sigma_ = ht.zeros((n_classes, n_features), dtype=X.dtype, device=X.device)
        self.class_count_ = ht.zeros((n_classes,), dtype=ht.float64, device=X.device)

        # Initialize the class prior
        # Take into account the priors
        if self.priors is not None:
            if not isinstance(self.priors, ht.DNDarray):
                priors = ht.array(self.priors, dtype=X.dtype, split=None, device=X.device)
            else:
                priors = self.priors
            # Check that the provided priors match the number of classes
            if len(priors) != n_classes:
                raise ValueError("Number of priors must match number of classes.")
            # Check that the priors sum to 1
            if not ht.isclose(priors.sum(), ht.array(1.0, dtype=priors.dtype)):
                raise ValueError("The sum of the priors should be 1.")
            # Check that the priors are non-negative
            if (priors < 0).any():
                raise ValueError("Priors must be non-negative.")
            self.class_prior_ = priors
        else:
            # Initialize the priors to zeros for each class
            self.class_prior_ = ht.zeros(
                len(self.classes_), dtype=ht.float64, split=None, device=X.device
            )
    else:
        if X.shape[1] != self.theta_.shape[1]:
            raise ValueError(
                "Number of features {} does not match previous data {}.".format(
                    X.shape[1], self.theta_.shape[1]
                )
            )
        # Put epsilon back in each time
        self.sigma_[:, :] -= self.epsilon_

    classes = self.classes_

    unique_y = ht.unique(y, sorted=True)
    if unique_y.split is not None:
        unique_y = ht.resplit(unique_y, axis=None)
    unique_y_in_classes = ht.eq(unique_y, classes)

    if not ht.all(unique_y_in_classes):
        raise ValueError(
            "The target label(s) {} in y do not exist in the initial classes {}".format(
                unique_y[~unique_y_in_classes], classes
            )
        )

    for y_i in unique_y:
        # assuming classes.split is None
        if y_i in classes:
            i = ht.where(classes == y_i).item()
        else:
            classes_ext = torch.cat(
                (classes._DNDarray__array, y_i._DNDarray__array.unsqueeze(0))
            )
            i = torch.argsort(classes_ext)[-1].item()
        where_y_i = ht.where(y == y_i)._DNDarray__array.tolist()
        X_i = X[where_y_i, :]

        if sample_weight is not None:
            sw_i = sample_weight[where_y_i]
            if 0 not in sw_i.shape:
                N_i = sw_i.sum()
            else:
                N_i = 0.0
                sw_i = None
        else:
            sw_i = None
            N_i = X_i.shape[0]

        new_theta, new_sigma = self.__update_mean_variance(
            self.class_count_[i], self.theta_[i, :], self.sigma_[i, :], X_i, sw_i
        )

        self.theta_[i, :] = new_theta
        self.sigma_[i, :] = new_sigma
        self.class_count_[i] += N_i

    self.sigma_[:, :] += self.epsilon_

    # Update only if no priors are provided
    if self.priors is None:
        # Empirical prior, with sample_weight taken into account
        self.class_prior_ = self.class_count_ / self.class_count_.sum()

    return self
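# Illustrative usage sketch (standalone, not part of the class above): how the
# public partial_fit entry point that wraps __partial_fit is typically called for
# incremental training on chunks of a distributed dataset. The toy data below is
# made up; GaussianNB, ht.random.randn and ht.array are the assumed HeAT names.
import heat as ht
from heat.naive_bayes import GaussianNB

X_example = ht.random.randn(100, 4, split=0)                  # 100 samples, 4 features
y_example = ht.array([i % 3 for i in range(100)], split=0)    # 3 classes: 0, 1, 2

gnb = GaussianNB()
# First call must announce all classes that can ever appear;
# later calls update theta_, sigma_ and class_count_ cumulatively.
gnb.partial_fit(X_example[:50], y_example[:50], classes=ht.array([0, 1, 2]))
gnb.partial_fit(X_example[50:], y_example[50:])
print(gnb.theta_.shape, gnb.sigma_.shape)                     # (3, 4) each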
@staticmethod
def __update_mean_variance(n_past, mu, var, X, sample_weight=None):
    """
    Compute online update of Gaussian mean and variance. Adapted to HeAT from scikit-learn.

    Given starting sample count, mean, and variance, a new set of
    points X, and optionally sample weights, return the updated mean and
    variance. (NB - each dimension (column) in X is treated as independent
    -- you get variance, not covariance).

    Can take scalar mean and variance, or vector mean and variance to
    simultaneously update a number of independent Gaussians.

    See Chan, Golub, and LeVeque 1983 [1].

    Parameters
    ----------
    n_past : int
        Number of samples represented in old mean and variance. If sample
        weights were given, this should contain the sum of sample
        weights represented in old mean and variance.
    mu : ht.tensor of shape (number of Gaussians,)
        Means for Gaussians in original set.
    var : ht.tensor of shape (number of Gaussians,)
        Variances for Gaussians in original set.
    X : ht.tensor of shape (n_samples, n_features)
        New data points to fold into the running estimates.
    sample_weight : ht.tensor of shape (n_samples,), optional (default=None)
        Weights applied to individual samples (1. for unweighted).

    Returns
    -------
    total_mu : ht.tensor of shape (number of Gaussians,)
        Updated mean for each Gaussian over the combined set.
    total_var : ht.tensor of shape (number of Gaussians,)
        Updated variance for each Gaussian over the combined set.

    References
    ----------
    [1] Chan, Tony F., Golub, Gene H., and LeVeque, Randall J.,
        "Algorithms for Computing the Sample Variance: Analysis and
        Recommendations", The American Statistician, 37:3, pp. 242-247, 1983
    """
    if X.shape[0] == 0:
        return mu, var

    # Compute (potentially weighted) mean and variance of new datapoints
    # TODO: Issue #351 allow weighted average across multiple axes
    if sample_weight is not None:
        n_new = float(sample_weight.sum())
        new_mu = ht.average(X, axis=0, weights=sample_weight)
        new_var = ht.average((X - new_mu) ** 2, axis=0, weights=sample_weight)
    else:
        n_new = X.shape[0]
        new_var = ht.var(X, axis=0)
        new_mu = ht.mean(X, axis=0)

    if n_past == 0:
        return new_mu, new_var

    n_total = float(n_past + n_new)

    # Combine mean of old and new data, taking into consideration
    # (weighted) number of observations
    total_mu = (n_new * new_mu + n_past * mu) / n_total

    # Combine variance of old and new data, taking into consideration
    # (weighted) number of observations. This is achieved by combining
    # the sum-of-squared-differences (ssd)
    old_ssd = n_past * var
    new_ssd = n_new * new_var
    total_ssd = old_ssd + new_ssd + (n_new * n_past / n_total) * (mu - new_mu) ** 2
    total_var = total_ssd / n_total

    return total_mu, total_var
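# Standalone NumPy check of the Chan/Golub/LeVeque update implemented above:
# folding a new batch into (n_past, mu, var) must reproduce the population
# statistics of the concatenated data. The arrays below are made-up toy values.
import numpy as np

old_batch = np.array([1.0, 2.0, 4.0])
new_batch = np.array([3.0, 5.0, 6.0, 10.0])

n_past, mu, var = len(old_batch), old_batch.mean(), old_batch.var()   # population variance (ddof=0)
n_new, new_mu, new_var = len(new_batch), new_batch.mean(), new_batch.var()

n_total = n_past + n_new
total_mu = (n_new * new_mu + n_past * mu) / n_total
total_ssd = n_past * var + n_new * new_var + (n_new * n_past / n_total) * (mu - new_mu) ** 2
total_var = total_ssd / n_total

both = np.concatenate([old_batch, new_batch])
assert np.isclose(total_mu, both.mean()) and np.isclose(total_var, both.var())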
def test_var(self):
    array_0_len = ht.MPI_WORLD.size * 2
    array_1_len = ht.MPI_WORLD.size * 2
    array_2_len = ht.MPI_WORLD.size * 2

    # test raises
    x = ht.zeros((2, 3, 4))
    with self.assertRaises(ValueError):
        x.var(axis=10)
    with self.assertRaises(ValueError):
        x.var(axis=[4])
    with self.assertRaises(ValueError):
        x.var(axis=[-4])
    with self.assertRaises(TypeError):
        ht.var(x, axis="01")
    with self.assertRaises(ValueError):
        ht.var(x, axis=(0, "10"))
    with self.assertRaises(ValueError):
        ht.var(x, axis=(0, 0))
    with self.assertRaises(NotImplementedError):
        ht.var(x, ddof=2)
    with self.assertRaises(ValueError):
        ht.var(x, ddof=-2)
    with self.assertRaises(ValueError):
        ht.mean(x, axis=torch.Tensor([0, 0]))

    a = ht.arange(1, 5)
    self.assertEqual(a.var(ddof=1), 1.666666666666666)

    # ones
    dimensions = []
    for d in [array_0_len, array_1_len, array_2_len]:
        dimensions.extend([d])
        hold = list(range(len(dimensions)))
        hold.append(None)
        for split in hold:  # loop over the number of dimensions of the test array
            z = ht.ones(dimensions, split=split)
            res = z.var(ddof=0)
            total_dims_list = list(z.shape)
            self.assertTrue((res == 0).all())
            # loop over the different single dimensions for var
            for it in range(len(z.shape)):
                res = z.var(axis=it)
                self.assertTrue(ht.allclose(res, 0))
                target_dims = [
                    total_dims_list[q] for q in range(len(total_dims_list)) if q != it
                ]
                if not target_dims:
                    target_dims = ()
                self.assertEqual(res.gshape, tuple(target_dims))
                if z.split is None:
                    sp = None
                else:
                    sp = z.split if it > z.split else z.split - 1
                    if it == split:
                        sp = None
                self.assertEqual(res.split, sp)
                if split == it:
                    res = z.var(axis=it)
                    self.assertTrue(ht.allclose(res, 0))

            loop_list = [
                ",".join(map(str, comb))
                for comb in combinations(list(range(len(z.shape))), 2)
            ]
            for it in loop_list:  # loop over the different combinations of dimensions for var
                lp_split = [int(q) for q in it.split(",")]
                res = z.var(axis=lp_split)
                self.assertTrue((res == 0).all())
                target_dims = [
                    total_dims_list[q]
                    for q in range(len(total_dims_list))
                    if q not in lp_split
                ]
                if not target_dims:
                    target_dims = (1,)
                if res.gshape:
                    self.assertEqual(res.gshape, tuple(target_dims))
                if res.split is not None:
                    if any([split >= x for x in lp_split]):
                        self.assertEqual(res.split, len(target_dims) - 1)
                    else:
                        self.assertEqual(res.split, z.split)

    # values for the iris dataset var measured by libreoffice calc
    for sp in [None, 0, 1]:
        iris = ht.load("heat/datasets/data/iris.csv", sep=";", split=sp)
        self.assertTrue(ht.allclose(ht.var(iris, bessel=True), 3.90318519755147))
def __partial_fit(
    self,
    x: DNDarray,
    y: DNDarray,
    classes: Optional[DNDarray] = None,
    _refit: bool = False,
    sample_weight: Optional[DNDarray] = None,
):
    """
    Actual implementation of Gaussian NB fitting. Adapted to HeAT from scikit-learn.

    Parameters
    ----------
    x : DNDarray
        Training set, where n_samples is the number of samples and n_features
        is the number of features. Shape = (n_samples, n_features)
    y : DNDarray
        Labels for training set. Shape = (n_samples,)
    classes : DNDarray, optional
        List of all the classes that can possibly appear in the y vector.
        Must be provided at the first call to :func:`partial_fit`, can be
        omitted in subsequent calls. Shape = (n_classes,)
    _refit : bool, optional
        If ``True``, act as though this were the first time :func:`__partial_fit`
        is called (i.e., throw away any past fitting and start over).
    sample_weight : DNDarray, optional
        Weights applied to individual samples (1. for unweighted).
        Shape = (n_samples,)
    """
    # TODO: sanitize x and y shape: sanitation/validation module, cf. #468
    n_samples = x.shape[0]
    if x.ndim != 2:
        raise ValueError("expected x to be a 2-D tensor, is {}-D".format(x.ndim))
    if y.shape[0] != n_samples:
        raise ValueError(
            "y.shape[0] must match number of samples {}, is {}".format(n_samples, y.shape[0])
        )
    # TODO: sanitize sample_weight: sanitation/validation module, cf. #468
    if sample_weight is not None:
        if sample_weight.ndim != 1:
            raise ValueError("Sample weights must be 1D tensor")
        if sample_weight.shape != (n_samples,):
            raise ValueError(
                "sample_weight.shape == {}, expected {}!".format(
                    sample_weight.shape, (n_samples,)
                )
            )

    # If the ratio of data variance between dimensions is too small, it
    # will cause numerical errors. To address this, we artificially
    # boost the variance by epsilon, a small fraction of the standard
    # deviation of the largest dimension.
    self.epsilon_ = self.var_smoothing * ht.var(x, axis=0).max()

    if _refit:
        self.classes_ = None

    if self.__check_partial_fit_first_call(classes):
        # This is the first call to partial_fit:
        # initialize various cumulative counters
        n_features = x.shape[1]
        n_classes = len(self.classes_)
        self.theta_ = ht.zeros((n_classes, n_features), dtype=x.dtype, device=x.device)
        self.sigma_ = ht.zeros((n_classes, n_features), dtype=x.dtype, device=x.device)
        self.class_count_ = ht.zeros(
            (x.comm.size, n_classes), dtype=ht.float64, device=x.device, split=0
        )

        # Initialize the class prior
        # Take into account the priors
        if self.priors is not None:
            if not isinstance(self.priors, ht.DNDarray):
                priors = ht.array(self.priors, dtype=x.dtype, split=None, device=x.device)
            else:
                priors = self.priors
            # Check that the provided priors match the number of classes
            if len(priors) != n_classes:
                raise ValueError("Number of priors must match number of classes.")
            # Check that the priors sum to 1
            if not ht.isclose(priors.sum(), ht.array(1.0, dtype=priors.dtype)):
                raise ValueError("The sum of the priors should be 1.")
            # Check that the priors are non-negative
            if (priors < 0).any():
                raise ValueError("Priors must be non-negative.")
            self.class_prior_ = priors
        else:
            # Initialize the priors to zeros for each class
            self.class_prior_ = ht.zeros(
                len(self.classes_), dtype=ht.float64, split=None, device=x.device
            )
    else:
        if x.shape[1] != self.theta_.shape[1]:
            raise ValueError(
                "Number of features {} does not match previous data {}.".format(
                    x.shape[1], self.theta_.shape[1]
                )
            )
        # Put epsilon back in each time
        self.sigma_[:, :] -= self.epsilon_

    classes = self.classes_

    unique_y = ht.unique(y, sorted=True).resplit_(None)
    unique_y_in_classes = ht.eq(unique_y, classes)

    if not ht.all(unique_y_in_classes):
        raise ValueError(
            "The target label(s) {} in y do not exist in the initial classes {}".format(
                unique_y[~unique_y_in_classes], classes
            )
        )

    # from now on: extract torch tensors for local operations,
    # use DNDarrays for distributed operations only
    for y_i in unique_y.larray:
        # assuming classes.split is None
        if y_i in classes.larray:
            i = torch.where(classes.larray == y_i)[0].item()
        else:
            classes_ext = torch.cat((classes.larray, y_i.unsqueeze(0)))
            i = torch.argsort(classes_ext)[-1].item()
        where_y_i = torch.where(y.larray == y_i)[0]
        X_i = x[where_y_i, :]

        if sample_weight is not None:
            sw_i = sample_weight[where_y_i]
            if 0 not in sw_i.shape:
                N_i = sw_i.sum().item()
            else:
                N_i = 0.0
                sw_i = None
        else:
            sw_i = None
            N_i = X_i.shape[0]

        new_theta, new_sigma = self.__update_mean_variance(
            self.class_count_.larray[:, i].item(),
            self.theta_[i, :],
            self.sigma_[i, :],
            X_i,
            sw_i,
        )

        self.theta_[i, :] = new_theta
        self.sigma_[i, :] = new_sigma
        self.class_count_.larray[:, i] += N_i

    self.sigma_[:, :] += self.epsilon_

    # Update only if no priors are provided
    if self.priors is None:
        # distributed class_count_: sum along distribution axis
        self.class_count_ = self.class_count_.sum(axis=0, keepdim=True)
        # Empirical prior, with sample_weight taken into account
        self.class_prior_ = (self.class_count_ / self.class_count_.sum()).squeeze(0)

    return self
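# Minimal standalone sketch (toy values, not part of the class above) of the
# per-process bookkeeping used by the refactored method: every rank owns one
# row of the (comm.size, n_classes) class_count_ array (split=0) and updates it
# locally via .larray; summing over axis 0 is then the global reduction that
# yields the final per-class counts and the empirical priors.
import heat as ht

counts = ht.zeros((ht.MPI_WORLD.size, 3), dtype=ht.float64, split=0)
counts.larray[:, 0] += 2.0          # this rank saw 2 local samples of class 0
counts.larray[:, 1] += 1.0          # ... and 1 local sample of class 1
global_counts = counts.sum(axis=0, keepdim=True)        # aggregated over all ranks
print((global_counts / global_counts.sum()).squeeze(0))  # empirical class priors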
def test_var(self):
    array_0_len = ht.MPI_WORLD.size * 2
    array_1_len = ht.MPI_WORLD.size * 2
    array_2_len = ht.MPI_WORLD.size * 2

    # test raises
    x = ht.zeros((2, 3, 4), device=ht_device)
    with self.assertRaises(TypeError):
        ht.var(x, axis=0, bessel=1)
    with self.assertRaises(ValueError):
        ht.var(x, axis=10)
    with self.assertRaises(TypeError):
        ht.var(x, axis="01")

    a = ht.arange(1, 5, device=ht_device)
    self.assertEqual(a.var(), 1.666666666666666)

    # ones
    dimensions = []
    for d in [array_0_len, array_1_len, array_2_len]:
        dimensions.extend([d])
        hold = list(range(len(dimensions)))
        hold.append(None)
        for split in hold:  # loop over the number of dimensions of the test array
            z = ht.ones(dimensions, split=split, device=ht_device)
            res = z.var()
            total_dims_list = list(z.shape)
            self.assertTrue((res == 0).all())
            # loop over the different single dimensions for var
            for it in range(len(z.shape)):
                res = z.var(axis=it)
                self.assertTrue(ht.allclose(res, 0))
                target_dims = [
                    total_dims_list[q] for q in range(len(total_dims_list)) if q != it
                ]
                if not target_dims:
                    target_dims = ()
                self.assertEqual(res.gshape, tuple(target_dims))
                if z.split is None:
                    sp = None
                else:
                    sp = z.split if it > z.split else z.split - 1
                    if it == split:
                        sp = None
                self.assertEqual(res.split, sp)
                if split == it:
                    res = z.var(axis=it)
                    self.assertTrue(ht.allclose(res, 0))

            z = ht.ones(dimensions, split=split, device=ht_device)
            res = z.var(bessel=False)
            self.assertTrue(ht.allclose(res, 0))

    # values for the iris dataset var measured by libreoffice calc
    for sp in [None, 0, 1]:
        iris = ht.load("heat/datasets/data/iris.csv", sep=";", split=sp, device=ht_device)
        self.assertTrue(ht.allclose(ht.var(iris, bessel=True), 3.90318519755147))
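# Small standalone illustration (assuming at least one MPI process) of the split
# bookkeeping the assertions above verify: reducing along the distributed axis
# collects the result on every process (split becomes None), while reducing an
# earlier axis shifts the split index down by one.
import heat as ht

z_example = ht.ones((4, 6), split=1)
print(z_example.var(axis=1).split)  # None: the distributed axis was reduced away
print(z_example.var(axis=0).split)  # 0: former axis 1 is now axis 0 and stays distributed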