def _partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None):
    """Prepare differential-privacy state, then delegate to the parent ``_partial_fit``.

    Parameters
    ----------
    X : array, shape=(n_samples, n_features)
        Batch of training data. Must expose ``shape``; the first axis is taken as the number of samples and the
        second as the number of dimensions passed to the bounds check.

    y, classes, _refit
        Forwarded unchanged to the parent implementation.

    sample_weight : Ignored
        Not used. Passing anything other than ``None`` triggers ``warn_unused_args("sample_weight")``; the parent
        implementation is always called with ``sample_weight=None``.

    Returns
    -------
    self
    """
    if sample_weight is not None:
        warn_unused_args("sample_weight")

    # Store size of current X to apply differential privacy later on. The attribute is only meant to exist for the
    # duration of this call.
    self.new_n_samples = X.shape[0]

    try:
        if self.bounds is None:
            warnings.warn(
                "Bounds have not been specified and will be calculated on the data provided. This will "
                "result in additional privacy leakage. To ensure differential privacy and no additional "
                "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
            # Fall back to per-column (min, max) pairs computed from the data itself.
            self.bounds = list(zip(np.min(X, axis=0), np.max(X, axis=0)))

        self.bounds = _check_bounds(self.bounds, X.shape[1])

        super()._partial_fit(X, y, classes, _refit, sample_weight=None)
    finally:
        # Always drop the temporary attribute, even when the bounds check or the parent fit raises, so a failed
        # call does not leave stale state on the estimator.
        del self.new_n_samples

    return self
def fit(self, X, y=None, sample_weight=None):
    """Computes k-means clustering with differential privacy.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features)
        Training instances to cluster.

    y : Ignored
        Not used, present here for API consistency by convention.

    sample_weight : Ignored
        Not used in diffprivlib, present here for consistency with :obj:`sklearn.cluster.KMeans`.  Specifying this
        parameter will result in a :class:`.DiffprivlibCompatibilityWarning`.

    Returns
    -------
    self : class

    Raises
    ------
    ValueError
        If `X` does not have exactly two dimensions.

    """
    if sample_weight is not None:
        warn_unused_args("sample_weight")

    del y

    # Plain sequences (lists, tuples) have no `ndim`/`shape`; convert them so the documented "array-like" input
    # works. Objects that already expose `ndim` are passed through untouched, so existing callers see no change.
    if not hasattr(X, "ndim"):
        X = np.asarray(X)

    if X.ndim != 2:
        raise ValueError(
            "Expected 2D array, got array with %d dimensions instead. Reshape your data using "
            "array.reshape(-1, 1), or array.reshape(1, -1) if your data contains only one sample." % X.ndim)

    n_samples, n_dims = X.shape

    iters = self._calc_iters(n_dims, n_samples)

    if self.bounds is None:
        warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
                      "result in additional privacy leakage. To ensure differential privacy and no additional "
                      "privacy leakage, specify `bounds` for each dimension.", PrivacyLeakWarning)
        # Fall back to per-column (min, max) pairs computed from the data itself.
        self.bounds = list(zip(np.min(X, axis=0), np.max(X, axis=0)))

    self.bounds = _check_bounds(self.bounds, n_dims)

    centers = self._init_centers(n_dims)

    # Assign labels against the initial centers, then run `iters` rounds of "update centers, re-assign labels".
    # Ending every round with a label assignment keeps `labels` consistent with the final `centers`, since
    # convergence is unlikely.
    distances, labels = self._distances_labels(X, centers)

    for _ in range(iters):
        centers = self._update_centers(X, centers=centers, labels=labels, dims=n_dims, total_iters=iters)
        distances, labels = self._distances_labels(X, centers)

    self.cluster_centers_ = centers
    self.labels_ = labels
    # Sum, over all samples, of the distance entry for the cluster each sample was assigned to.
    self.inertia_ = distances[np.arange(len(labels)), labels].sum()
    self.n_iter_ = iters

    return self