def test_where(self):
    # cases to test
    # no x and y
    a = ht.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], split=None)
    cond = a > 3
    wh = ht.where(cond)
    self.assertEqual(wh.gshape, (6, 2))
    self.assertEqual(wh.dtype, ht.int64)
    self.assertEqual(wh.split, None)

    # split
    a = ht.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], split=1)
    cond = a > 3
    wh = ht.where(cond)
    self.assertEqual(wh.gshape, (6, 2))
    self.assertEqual(wh.dtype, ht.int64)
    self.assertEqual(wh.split, 0)

    # not split cond
    a = ht.array([[0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [0.0, 3.0, 6.0]], split=None)
    res = ht.array([[0.0, 1.0, 2.0], [0.0, 2.0, -1.0], [0.0, 3.0, -1.0]], split=None)
    wh = ht.where(a < 4.0, a, -1)
    self.assertTrue(
        ht.equal(a[ht.nonzero(a < 4)], ht.array([0.0, 1.0, 2.0, 0.0, 2.0, 0.0, 3.0]))
    )
    self.assertTrue(ht.equal(wh, res))
    self.assertEqual(wh.gshape, (3, 3))
    self.assertEqual(wh.dtype, ht.float64)

    # split cond
    a = ht.array([[0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [0.0, 3.0, 6.0]], split=0)
    res = ht.array([[0.0, 1.0, 2.0], [0.0, 2.0, -1.0], [0.0, 3.0, -1.0]], split=0)
    wh = ht.where(a < 4.0, a, -1)
    self.assertTrue(ht.all(wh[ht.nonzero(a >= 4)] == -1))
    self.assertTrue(ht.equal(wh, res))
    self.assertEqual(wh.gshape, (3, 3))
    self.assertEqual(wh.dtype, ht.float64)
    self.assertEqual(wh.split, 0)

    a = ht.array([[0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [0.0, 3.0, 6.0]], split=1)
    res = ht.array([[0.0, 1.0, 2.0], [0.0, 2.0, -1.0], [0.0, 3.0, -1.0]], split=1)
    wh = ht.where(a < 4.0, a, -1.0)
    self.assertTrue(ht.equal(wh, res))
    self.assertEqual(wh.gshape, (3, 3))
    self.assertEqual(wh.dtype, ht.float)
    self.assertEqual(wh.split, 1)

    with self.assertRaises(TypeError):
        ht.where(cond, a)
    with self.assertRaises(NotImplementedError):
        ht.where(cond, ht.ones((3, 3), split=0), ht.ones((3, 3), split=1))
def calculate_accuracy(new_y, verification_y):
    """
    Calculates the accuracy of classification/clustering algorithms.

    Note that this only works with integer/discrete classes. For algorithms that
    return approximations, an error function is required instead.

    Parameters
    ----------
    new_y : ht.tensor of shape (n_samples, n_features), required
        The newly generated labels.
    verification_y : ht.tensor of shape (n_samples, n_features), required
        The known labels.

    Returns
    -------
    float
        The accuracy, i.e. the number of correctly labeled samples divided by the
        total number of labels.
    """
    if new_y.gshape != verification_y.gshape:
        raise ValueError(
            "Expecting results of same length, got {}, {}".format(
                new_y.gshape, verification_y.gshape
            )
        )

    count = ht.sum(ht.where(new_y == verification_y, 1, 0))

    return count / new_y.gshape[0]
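
# A minimal usage sketch for calculate_accuracy, assuming heat is importable as `ht` and
# the labels are 1-D integer tensors; the toy values below are made up for illustration.
def _calculate_accuracy_example():
    import heat as ht

    predicted = ht.array([0, 1, 1, 2])
    truth = ht.array([0, 1, 2, 2])
    # three of the four labels agree, so the returned accuracy is 0.75
    return calculate_accuracy(predicted, truth)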
def label_to_one_hot(a):
    # encode the integer labels in a as a one-hot matrix of shape (n_samples, max_label + 1)
    max_label = ht.max(a)
    a = a.expand_dims(1)
    # one row of candidate labels [0, ..., max_label] per sample
    items = ht.arange(0, max_label.item() + 1)
    one_hot = ht.stack([items for i in range(a.shape[0])], axis=0)
    # 1 where the candidate label matches the sample's label, 0 elsewhere
    one_hot = ht.where(one_hot == a, 1, 0)
    return one_hot
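
# A minimal usage sketch for label_to_one_hot, assuming heat is importable as `ht` and the
# labels are non-negative integers; the toy vector below is made up for illustration.
def _label_to_one_hot_example():
    import heat as ht

    labels = ht.array([0, 2, 1])
    # expected encoding, one row per sample and one column per label 0..max_label:
    # [[1, 0, 0],
    #  [0, 0, 1],
    #  [0, 1, 0]]
    return label_to_one_hot(labels)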
def construct(self, X):
    # pairwise similarity matrix with the self-similarities on the diagonal zeroed out
    S = self.similarity_metric(X)
    S.fill_diagonal(0.0)

    if self.mode == "eNeighbour":
        # epsilon-neighbourhood graph: keep (weighted) or binarize (unweighted) the
        # entries on the admissible side of the threshold, zero out the rest
        if self.epsilon[0] == "upper":
            if self.weighted:
                S = ht.where(S < self.epsilon[1], S, 0)
            else:
                S = ht.int(S < self.epsilon[1])
        else:
            if self.weighted:
                S = ht.where(S > self.epsilon[1], S, 0)
            else:
                S = ht.int(S > self.epsilon[1])

    # build the graph Laplacian according to the chosen definition
    if self.definition == "simple":
        L = self._simple_L(S)
    elif self.definition == "norm_sym":
        L = self._normalized_symmetric_L(S)

    return L
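
# A minimal sketch of the epsilon-neighbourhood thresholding used in construct() above,
# shown on a hard-coded similarity matrix; the threshold 0.5 and the "upper" boundary are
# arbitrary example values, not defaults of the class.
#
#   S = ht.array([[0.0, 0.2, 0.8], [0.2, 0.0, 0.4], [0.8, 0.4, 0.0]])
#   weighted = ht.where(S < 0.5, S, 0)  # keep similarities below the threshold, zero the rest
#   binary = ht.int(S < 0.5)            # unweighted 0/1 adjacency instead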
def fit(self, X):
    """
    Computes the low-dimensional representation by calculating the eigenspectrum
    (eigenvalues and eigenvectors) of the graph Laplacian derived from the similarity
    matrix, and fits the eigenvectors corresponding to the k lowest eigenvalues with a
    separate clustering algorithm (currently only kmeans is supported). Similarity
    metrics for the adjacency calculation are supported via spatial.distance. The
    eigenvalues and eigenvectors are computed by reducing the Laplacian via Lanczos
    iterations and running the torch eigenvalue solver on this smaller matrix. If other
    eigenvalue decomposition methods are supported, this will be expanded.

    Parameters
    ----------
    X : ht.DNDarray, shape=(n_samples, n_features)
        Training instances to cluster.
    """
    # 1. input sanitation
    if not isinstance(X, ht.DNDarray):
        raise ValueError("input needs to be a ht.DNDarray, but was {}".format(type(X)))
    if X.split is not None and X.split != 0:
        raise NotImplementedError("Not implemented for other splitting-axes")

    # 2. embed the dataset into the lower-dimensional eigenvector space
    eigenvalues, eigenvectors = self._spectral_embedding(X)

    # 3. find the spectral gap, if the number of clusters is not defined from the outside
    if self.n_clusters is None:
        diff = eigenvalues[1:] - eigenvalues[:-1]
        tmp = ht.where(diff == diff.max()).item()
        self.n_clusters = tmp + 1

    components = eigenvectors[:, : self.n_clusters].copy()

    params = self._cluster.get_params()
    params["n_clusters"] = self.n_clusters
    self._cluster.set_params(**params)

    self._cluster.fit(components)
    self._labels = self._cluster.labels_
    self._cluster_centers = self._cluster.cluster_centers_

    return self
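
# A hedged usage sketch for fit(); the constructor arguments and the random input are
# illustrative assumptions, not the verified signature. Only unsplit or split=0 data is
# supported, as checked in the input sanitation above.
#
#   data = ht.random.rand(100, 4, split=0)  # hypothetical training data
#   model = Spectral(n_clusters=3)          # assumed constructor parameter
#   model.fit(data)
#   labels = model._labels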
def __partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None):
    """
    Actual implementation of Gaussian NB fitting. Adapted to HeAT from scikit-learn.

    Parameters
    ----------
    X : ht.tensor of shape (n_samples, n_features)
        Training set, where n_samples is the number of samples and
        n_features is the number of features.
    y : ht.tensor of shape (n_samples,)
        Labels for the training set.
    classes : ht.tensor of shape (n_classes,), optional (default=None)
        List of all the classes that can possibly appear in the y vector.
        Must be provided at the first call to partial_fit, can be omitted
        in subsequent calls.
    _refit : bool, optional (default=False)
        If true, act as though this were the first time __partial_fit is
        called (i.e., throw away any past fitting and start over).
    sample_weight : ht.tensor of shape (n_samples,), optional (default=None)
        Weights applied to individual samples (1. for unweighted).

    Returns
    -------
    self : object
    """
    # TODO: sanitize X and y shape: sanitation/validation module, cf. #468
    n_samples = X.shape[0]
    if X.numdims != 2:
        raise ValueError("expected X to be a 2-D tensor, is {}-D".format(X.numdims))
    if y.shape[0] != n_samples:
        raise ValueError(
            "y.shape[0] must match number of samples {}, is {}".format(n_samples, y.shape[0])
        )

    # TODO: sanitize sample_weight: sanitation/validation module, cf. #468
    if sample_weight is not None:
        if sample_weight.numdims != 1:
            raise ValueError("Sample weights must be 1D tensor")
        if sample_weight.shape != (n_samples,):
            raise ValueError(
                "sample_weight.shape == {}, expected {}!".format(
                    sample_weight.shape, (n_samples,)
                )
            )

    # If the ratio of data variance between dimensions is too small, it
    # will cause numerical errors. To address this, we artificially
    # boost the variance by epsilon, a small fraction of the standard
    # deviation of the largest dimension.
    self.epsilon_ = self.var_smoothing * ht.var(X, axis=0).max()

    if _refit:
        self.classes_ = None

    if self.__check_partial_fit_first_call(classes):
        # This is the first call to partial_fit:
        # initialize various cumulative counters
        n_features = X.shape[1]
        n_classes = len(self.classes_)
        self.theta_ = ht.zeros((n_classes, n_features), dtype=X.dtype, device=X.device)
        self.sigma_ = ht.zeros((n_classes, n_features), dtype=X.dtype, device=X.device)

        self.class_count_ = ht.zeros((n_classes,), dtype=ht.float64, device=X.device)

        # Initialise the class prior
        # Take into account the priors
        if self.priors is not None:
            if not isinstance(self.priors, ht.DNDarray):
                priors = ht.array(self.priors, dtype=X.dtype, split=None, device=X.device)
            else:
                priors = self.priors
            # Check that the provided priors match the number of classes
            if len(priors) != n_classes:
                raise ValueError("Number of priors must match number of classes.")
            # Check that the priors sum to 1
            if not ht.isclose(priors.sum(), ht.array(1.0, dtype=priors.dtype)):
                raise ValueError("The sum of the priors should be 1.")
            # Check that the priors are non-negative
            if (priors < 0).any():
                raise ValueError("Priors must be non-negative.")
            self.class_prior_ = priors
        else:
            # Initialize the priors to zeros for each class
            self.class_prior_ = ht.zeros(
                len(self.classes_), dtype=ht.float64, split=None, device=X.device
            )
    else:
        if X.shape[1] != self.theta_.shape[1]:
            raise ValueError(
                "Number of features {} does not match previous data {}.".format(
                    X.shape[1], self.theta_.shape[1]
                )
            )
        # Put epsilon back in each time
        self.sigma_[:, :] -= self.epsilon_

    classes = self.classes_

    unique_y = ht.unique(y, sorted=True)
    if unique_y.split is not None:
        unique_y = ht.resplit(unique_y, axis=None)
    unique_y_in_classes = ht.eq(unique_y, classes)

    if not ht.all(unique_y_in_classes):
        raise ValueError(
            "The target label(s) {} in y do not exist in the "
            "initial classes {}".format(unique_y[~unique_y_in_classes], classes)
        )

    for y_i in unique_y:
        # assuming classes.split is None
        if y_i in classes:
            i = ht.where(classes == y_i).item()
        else:
            classes_ext = torch.cat(
                (classes._DNDarray__array, y_i._DNDarray__array.unsqueeze(0))
            )
            i = torch.argsort(classes_ext)[-1].item()
        where_y_i = ht.where(y == y_i)._DNDarray__array.tolist()
        X_i = X[where_y_i, :]

        if sample_weight is not None:
            sw_i = sample_weight[where_y_i]
            if 0 not in sw_i.shape:
                N_i = sw_i.sum()
            else:
                N_i = 0.0
                sw_i = None
        else:
            sw_i = None
            N_i = X_i.shape[0]

        new_theta, new_sigma = self.__update_mean_variance(
            self.class_count_[i], self.theta_[i, :], self.sigma_[i, :], X_i, sw_i
        )

        self.theta_[i, :] = new_theta
        self.sigma_[i, :] = new_sigma
        self.class_count_[i] += N_i

    self.sigma_[:, :] += self.epsilon_

    # Update only if no priors are provided
    if self.priors is None:
        # Empirical prior, with sample_weight taken into account
        self.class_prior_ = self.class_count_ / self.class_count_.sum()

    return self
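
# A hedged usage sketch of the partial-fit workflow above, assuming this private method
# backs a public partial_fit() as in scikit-learn; the class handle and the toy batches
# are assumptions made for illustration.
#
#   clf = GaussianNB()
#   clf.partial_fit(X_batch_1, y_batch_1, classes=ht.array([0, 1, 2]))
#   clf.partial_fit(X_batch_2, y_batch_2)  # later batches may omit `classes`
#   predictions = clf.predict(X_test)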