Example 1
    def test_where(self):
        # cases to test
        # no x and y
        a = ht.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], split=None)
        cond = a > 3
        wh = ht.where(cond)
        self.assertEqual(wh.gshape, (6, 2))
        self.assertEqual(wh.dtype, ht.int64)
        self.assertEqual(wh.split, None)
        # split
        a = ht.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], split=1)
        cond = a > 3
        wh = ht.where(cond)
        self.assertEqual(wh.gshape, (6, 2))
        self.assertEqual(wh.dtype, ht.int64)
        self.assertEqual(wh.split, 0)

        # not split cond
        a = ht.array([[0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [0.0, 3.0, 6.0]],
                     split=None)
        res = ht.array([[0.0, 1.0, 2.0], [0.0, 2.0, -1.0], [0.0, 3.0, -1.0]],
                       split=None)
        wh = ht.where(a < 4.0, a, -1)
        self.assertTrue(
            ht.equal(a[ht.nonzero(a < 4)],
                     ht.array([0.0, 1.0, 2.0, 0.0, 2.0, 0.0, 3.0])))
        self.assertTrue(ht.equal(wh, res))
        self.assertEqual(wh.gshape, (3, 3))
        self.assertEqual(wh.dtype, ht.float64)

        # split cond
        a = ht.array([[0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [0.0, 3.0, 6.0]],
                     split=0)
        res = ht.array([[0.0, 1.0, 2.0], [0.0, 2.0, -1.0], [0.0, 3.0, -1.0]],
                       split=0)
        wh = ht.where(a < 4.0, a, -1)
        self.assertTrue(ht.all(wh[ht.nonzero(a >= 4)] == -1))
        self.assertTrue(ht.equal(wh, res))
        self.assertEqual(wh.gshape, (3, 3))
        self.assertEqual(wh.dtype, ht.float64)
        self.assertEqual(wh.split, 0)

        a = ht.array([[0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [0.0, 3.0, 6.0]],
                     split=1)
        res = ht.array([[0.0, 1.0, 2.0], [0.0, 2.0, -1.0], [0.0, 3.0, -1.0]],
                       split=1)
        wh = ht.where(a < 4.0, a, -1.0)
        self.assertTrue(ht.equal(wh, res))
        self.assertEqual(wh.gshape, (3, 3))
        self.assertEqual(wh.dtype, ht.float)
        self.assertEqual(wh.split, 1)

        with self.assertRaises(TypeError):
            ht.where(cond, a)

        with self.assertRaises(NotImplementedError):
            ht.where(cond, ht.ones((3, 3), split=0), ht.ones((3, 3), split=1))
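Below is a minimal standalone sketch of the two ht.where() calling conventions the test exercises, assuming heat is importable as ht: the one-argument form returns the coordinates of all true elements, while the three-argument form selects elementwise between two operands.

import heat as ht

a = ht.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
coords = ht.where(a > 3)         # coordinate form: one (row, column) pair per true element
masked = ht.where(a > 3, a, -1)  # ternary form: take a where the condition holds, -1 elsewhere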
Example 2
def calculate_accuracy(new_y, verification_y):
    """
    Calculates the accuracy of classification/clustering-algorithms.
    Note that this only works with integer/discrete classes. For algorithms that produce approximations, an error
    function is required instead.

    Parameters
    ----------
    new_y : ht.tensor of shape (n_samples,), required
        The newly generated labels
    verification_y : ht.tensor of shape (n_samples,), required
        Known labels

    Returns
    -------
    float
        The accuracy: the number of correctly labeled samples divided by the total number of labels.
    """

    if new_y.gshape != verification_y.gshape:
        raise ValueError("Expecting results of same length, got {}, {}".format(
            new_y.gshape, verification_y.gshape))

    count = ht.sum(ht.where(new_y == verification_y, 1, 0))

    return count / new_y.gshape[0]
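A hypothetical usage sketch, assuming 1-D label tensors; the names predicted and truth are illustrative only.

import heat as ht

predicted = ht.array([0, 1, 1, 2])
truth = ht.array([0, 1, 2, 2])
acc = calculate_accuracy(predicted, truth)  # 3 of 4 labels match -> 0.75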
Example 3
    def label_to_one_hot(a):
        max_label = ht.max(a)
        a = a.expand_dims(1)

        items = ht.arange(0, max_label.item() + 1)
        one_hot = ht.stack([items for i in range(a.shape[0])], axis=0)
        one_hot = ht.where(one_hot == a, 1, 0)

        return one_hot
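A short sketch of the expected expansion, assuming integer labels in a 1-D tensor and that label_to_one_hot is reachable as a plain function.

import heat as ht

labels = ht.array([0, 2, 1])
encoded = label_to_one_hot(labels)
# encoded:
# [[1, 0, 0],
#  [0, 0, 1],
#  [0, 1, 0]]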
Example 4
    def construct(self, X):
        S = self.similarity_metric(X)
        S.fill_diagonal(0.0)

        if self.mode == "eNeighbour":
            if self.epsilon[0] == "upper":
                if self.weighted:
                    S = ht.where(S < self.epsilon[1], S, 0)
                else:
                    S = ht.int(S < self.epsilon[1])
            else:
                if self.weighted:
                    S = ht.where(S > self.epsilon[1], S, 0)
                else:
                    S = ht.int(S > self.epsilon[1])

        if self.definition == "simple":
            L = self._simple_L(S)
        elif self.definition == "norm_sym":
            L = self._normalized_symmetric_L(S)

        return L
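A minimal sketch of the epsilon-neighbourhood thresholding above, assuming a precomputed symmetric similarity matrix S and an "upper" bound of 0.5; it mirrors the weighted and unweighted branches.

import heat as ht

S = ht.array([[0.0, 0.3, 0.8], [0.3, 0.0, 0.4], [0.8, 0.4, 0.0]])
weighted = ht.where(S < 0.5, S, 0)  # keep similarities below the bound, zero out the rest
binary = ht.int(S < 0.5)            # 0/1 adjacency instead of weighted edges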
Example 5
    def fit(self, X):
        """
        Computes the low-dimensional representation by calculating the eigenspectrum (eigenvalues and eigenvectors)
        of the graph Laplacian obtained from the similarity matrix, and fits the eigenvectors corresponding to the k
        lowest eigenvalues with a separate clustering algorithm (currently only kmeans is supported). Similarity
        metrics for adjacency calculations are supported via spatial.distance. The eigenvalues and eigenvectors are
        computed by reducing the Laplacian via Lanczos iterations and running the torch eigenvalue solver on the
        resulting smaller matrix. If other eigenvalue decomposition methods become available, this will be expanded.

        Parameters
        ----------
        X : ht.DNDarray, shape=(n_samples, n_features)
            Training instances to cluster.
        """
        # 1. input sanitation
        if not isinstance(X, ht.DNDarray):
            raise ValueError(
                "input needs to be a ht.DNDarray, but was {}".format(type(X)))
        if X.split is not None and X.split != 0:
            raise NotImplementedError(
                "Not implemented for other splitting-axes")
        # 2. Embed Dataset into lower-dimensional Eigenvector space
        eigenvalues, eigenvectors = self._spectral_embedding(X)

        # 3. Find the spectral gap, if number of clusters is not defined from the outside
        if self.n_clusters is None:
            diff = eigenvalues[1:] - eigenvalues[:-1]
            tmp = ht.where(diff == diff.max()).item()
            self.n_clusters = tmp + 1

        components = eigenvectors[:, :self.n_clusters].copy()

        params = self._cluster.get_params()
        params["n_clusters"] = self.n_clusters
        self._cluster.set_params(**params)
        self._cluster.fit(components)
        self._labels = self._cluster.labels_
        self._cluster_centers = self._cluster.cluster_centers_

        return self
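The spectral-gap heuristic from step 3, sketched in isolation with made-up eigenvalues, assuming they are sorted in ascending order.

import heat as ht

eigenvalues = ht.array([0.0, 0.01, 0.02, 0.9, 1.1])
diff = eigenvalues[1:] - eigenvalues[:-1]             # consecutive gaps
n_clusters = ht.where(diff == diff.max()).item() + 1  # largest gap at index 2 -> 3 clusters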
Example 6
    def __partial_fit(self,
                      X,
                      y,
                      classes=None,
                      _refit=False,
                      sample_weight=None):
        """
        Actual implementation of Gaussian NB fitting. Adapted to HeAT from scikit-learn.

        Parameters
        ----------
        X : ht.tensor of shape (n_samples, n_features)
            Training set, where n_samples is the number of samples and
            n_features is the number of features.
        y : ht.tensor of shape (n_samples,)
            Labels for training set.
        classes : ht.tensor of shape (n_classes,), optional (default=None)
            List of all the classes that can possibly appear in the y vector.
            Must be provided at the first call to partial_fit, can be omitted
            in subsequent calls.
        _refit : bool, optional (default=False)
            If true, act as though this were the first time __partial_fit is called
            (i.e., throw away any past fitting and start over).
        sample_weight : ht.tensor of shape (n_samples,), optional (default=None)
            Weights applied to individual samples (1. for unweighted).

        Returns
        -------
        self : object
        """

        # TODO: sanitize X and y shape: sanitation/validation module, cf. #468
        n_samples = X.shape[0]
        if X.numdims != 2:
            raise ValueError("expected X to be a 2-D tensor, is {}-D".format(
                X.numdims))
        if y.shape[0] != n_samples:
            raise ValueError(
                "y.shape[0] must match number of samples {}, is {}".format(
                    n_samples, y.shape[0]))

        # TODO: sanitize sample_weight: sanitation/validation module, cf. #468
        if sample_weight is not None:
            if sample_weight.numdims != 1:
                raise ValueError("Sample weights must be 1D tensor")
            if sample_weight.shape != (n_samples, ):
                raise ValueError(
                    "sample_weight.shape == {}, expected {}!".format(
                        sample_weight.shape, (n_samples, )))

        # If the ratio of data variance between dimensions is too small, it
        # will cause numerical errors. To address this, we artificially
        # boost the variance by epsilon, a small fraction of the standard
        # deviation of the largest dimension.
        self.epsilon_ = self.var_smoothing * ht.var(X, axis=0).max()

        if _refit:
            self.classes_ = None

        if self.__check_partial_fit_first_call(classes):
            # This is the first call to partial_fit:
            # initialize various cumulative counters
            n_features = X.shape[1]
            n_classes = len(self.classes_)
            self.theta_ = ht.zeros((n_classes, n_features),
                                   dtype=X.dtype,
                                   device=X.device)
            self.sigma_ = ht.zeros((n_classes, n_features),
                                   dtype=X.dtype,
                                   device=X.device)

            self.class_count_ = ht.zeros((n_classes, ),
                                         dtype=ht.float64,
                                         device=X.device)

            # Initialize the class prior, taking provided priors into account
            if self.priors is not None:
                if not isinstance(self.priors, ht.DNDarray):
                    priors = ht.array(self.priors,
                                      dtype=X.dtype,
                                      split=None,
                                      device=X.device)
                else:
                    priors = self.priors
                # Check that the provided priors match the number of classes
                if len(priors) != n_classes:
                    raise ValueError("Number of priors must match number of"
                                     " classes.")
                # Check that the sum is 1
                if not ht.isclose(priors.sum(),
                                  ht.array(1.0, dtype=priors.dtype)):
                    raise ValueError("The sum of the priors should be 1.")
                # Check that the priors are non-negative
                if (priors < 0).any():
                    raise ValueError("Priors must be non-negative.")
                self.class_prior_ = priors
            else:
                # Initialize the priors to zeros for each class
                self.class_prior_ = ht.zeros(len(self.classes_),
                                             dtype=ht.float64,
                                             split=None,
                                             device=X.device)
        else:
            if X.shape[1] != self.theta_.shape[1]:
                raise ValueError(
                    "Number of features {} does not match previous data {}.".
                    format(X.shape[1], self.theta_.shape[1]))
            # Put epsilon back in each time
            self.sigma_[:, :] -= self.epsilon_

        classes = self.classes_

        unique_y = ht.unique(y, sorted=True)
        if unique_y.split is not None:
            unique_y = ht.resplit(unique_y, axis=None)
        unique_y_in_classes = ht.eq(unique_y, classes)

        if not ht.all(unique_y_in_classes):
            raise ValueError("The target label(s) {} in y do not exist in the "
                             "initial classes {}".format(
                                 unique_y[~unique_y_in_classes], classes))
        for y_i in unique_y:
            # assuming classes.split is None
            if y_i in classes:
                i = ht.where(classes == y_i).item()
            else:
                classes_ext = torch.cat((classes._DNDarray__array,
                                         y_i._DNDarray__array.unsqueeze(0)))
                i = torch.argsort(classes_ext)[-1].item()
            where_y_i = ht.where(y == y_i)._DNDarray__array.tolist()
            X_i = X[where_y_i, :]

            if sample_weight is not None:
                sw_i = sample_weight[where_y_i]
                if 0 not in sw_i.shape:
                    N_i = sw_i.sum()
                else:
                    N_i = 0.0
                    sw_i = None
            else:
                sw_i = None
                N_i = X_i.shape[0]

            new_theta, new_sigma = self.__update_mean_variance(
                self.class_count_[i], self.theta_[i, :], self.sigma_[i, :],
                X_i, sw_i)

            self.theta_[i, :] = new_theta
            self.sigma_[i, :] = new_sigma
            self.class_count_[i] += N_i

        self.sigma_[:, :] += self.epsilon_

        # Update only if no priors were provided
        if self.priors is None:
            # Empirical prior, with sample_weight taken into account
            self.class_prior_ = self.class_count_ / self.class_count_.sum()

        return self
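A sketch of the per-class index lookup used in the loop above, assuming classes is a small, non-split 1-D tensor of unique labels.

import heat as ht

classes = ht.array([0, 1, 2])
y_i = ht.array(1)
i = ht.where(classes == y_i).item()  # exactly one match, so .item() returns its index, 1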