Exemple #1
0
    def _partial_fit(self,
                     X,
                     y,
                     classes=None,
                     _refit=False,
                     sample_weight=None):
        """Resolve ``self.bounds`` and delegate to the parent class's ``_partial_fit``.

        Parameters
        ----------
        X : array, shape=(n_samples, n_features)
            Training data. Must expose ``.shape``; ``X.shape[1]`` is used as the number of dimensions when validating
            ``self.bounds``.

        y : array-like
            Forwarded unchanged to the parent implementation.

        classes : array-like, optional
            Forwarded unchanged to the parent implementation.

        _refit : bool, default: False
            Forwarded unchanged to the parent implementation.

        sample_weight : Ignored
            Never forwarded (the parent is always called with ``sample_weight=None``). Passing a value triggers
            ``warn_unused_args("sample_weight")``.

        Returns
        -------
        self : class

        Notes
        -----
        If ``self.bounds`` is ``None`` it is computed from the per-column min and max of `X` and a
        :class:`.PrivacyLeakWarning` is issued. The validated bounds are written back to ``self.bounds``.

        """
        if sample_weight is not None:
            warn_unused_args("sample_weight")

        if self.bounds is None:
            warnings.warn(
                "Bounds have not been specified and will be calculated on the data provided. This will "
                "result in additional privacy leakage. To ensure differential privacy and no additional "
                "privacy leakage, specify bounds for each dimension.",
                PrivacyLeakWarning)
            self.bounds = list(zip(np.min(X, axis=0), np.max(X, axis=0)))

        self.bounds = _check_bounds(self.bounds, X.shape[1])

        # Store size of current X to apply differential privacy later on. The attribute only needs to exist for the
        # duration of the parent call (presumably read by overridden hooks it invokes -- confirm against the class),
        # so it is set right before the call and removed in `finally` to avoid leaving stale state on the estimator
        # if the call raises.
        self.new_n_samples = X.shape[0]
        try:
            super()._partial_fit(X, y, classes, _refit, sample_weight=None)
        finally:
            del self.new_n_samples

        return self
    def fit(self, X, y=None, sample_weight=None):
        """Computes k-means clustering with differential privacy.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            Training instances to cluster. Converted with :func:`numpy.asarray`, so nested sequences are accepted.

        y : Ignored
            not used, present here for API consistency by convention.

        sample_weight : Ignored
            Not used in diffprivlib, present here for consistency with :obj:`sklearn.cluster.KMeans`. Specifying this
            parameter will result in a :class:`.DiffprivlibCompatibilityWarning`.

        Returns
        -------
        self : class

        Raises
        ------
        ValueError
            If `X` does not have exactly two dimensions.

        Notes
        -----
        If ``self.bounds`` is ``None`` it is computed from the per-column min and max of `X` and a
        :class:`.PrivacyLeakWarning` is issued. On completion the attributes ``cluster_centers_``, ``labels_``,
        ``inertia_`` and ``n_iter_`` are set on the estimator.

        """
        if sample_weight is not None:
            warn_unused_args("sample_weight")

        del y

        # The contract is "array-like": coerce up front so that `.ndim` / `.shape` below work on plain sequences too.
        X = np.asarray(X)

        if X.ndim != 2:
            raise ValueError(
                "Expected 2D array, got array with %d dimensions instead. Reshape your data using array.reshape(-1, 1), "
                "or array.reshape(1, -1) if your data contains only one sample." % X.ndim)

        n_samples, n_dims = X.shape

        iters = self._calc_iters(n_dims, n_samples)

        if self.bounds is None:
            warnings.warn("Bounds have not been specified and will be calculated on the data provided.  This will "
                          "result in additional privacy leakage. To ensure differential privacy and no additional "
                          "privacy leakage, specify `bounds` for each dimension.", PrivacyLeakWarning)
            self.bounds = list(zip(np.min(X, axis=0), np.max(X, axis=0)))

        self.bounds = _check_bounds(self.bounds, n_dims)

        centers = self._init_centers(n_dims)

        # Assign labels against the initial centers, then perform `iters` rounds of "update centers, re-assign
        # labels". Re-assigning after every update keeps the final `labels` consistent with the final `centers`,
        # since convergence is unlikely.
        distances, labels = self._distances_labels(X, centers)
        for _ in range(iters):
            centers = self._update_centers(X, centers=centers, labels=labels, dims=n_dims, total_iters=iters)
            distances, labels = self._distances_labels(X, centers)

        self.cluster_centers_ = centers
        self.labels_ = labels
        # For each sample i, pick distances[i, labels[i]] (the entry for its assigned cluster) and sum over samples.
        self.inertia_ = distances[np.arange(len(labels)), labels].sum()
        self.n_iter_ = iters

        return self