Example #1

# Assumed imports, following diffprivlib's module layout:
import numpy as np
from sklearn.utils import check_array
from sklearn.utils.validation import FLOAT_DTYPES

from diffprivlib.tools import mean
from diffprivlib.utils import warn_unused_args


def _preprocess_data(X, y, fit_intercept, epsilon=1.0, range_X=None, range_y=None, copy=True, check_input=True,
                     **unused_args):
    warn_unused_args(unused_args)

    if check_input:
        X = check_array(X, copy=copy, accept_sparse=False, dtype=FLOAT_DTYPES)
    elif copy:
        X = X.copy(order='K')

    y = np.asarray(y, dtype=X.dtype)
    X_scale = np.ones(X.shape[1], dtype=X.dtype)

    if fit_intercept:
        # Centre X and y using differentially private means over the given
        # per-feature ranges; each mean call consumes epsilon.
        X_offset = mean(X, axis=0, range=range_X, epsilon=epsilon)
        X -= X_offset
        y_offset = mean(y, axis=0, range=range_y, epsilon=epsilon)
        y = y - y_offset
    else:
        X_offset = np.zeros(X.shape[1], dtype=X.dtype)
        if y.ndim == 1:
            y_offset = X.dtype.type(0)
        else:
            y_offset = np.zeros(y.shape[1], dtype=X.dtype)

    return X, y, X_offset, y_offset, X_scale
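
A minimal usage sketch for this range-based variant. The data, epsilon and range values below are illustrative, and the exact range format is whatever this diffprivlib version's mean() accepts.

import numpy as np

X = np.array([[0.1, 0.5], [0.4, 0.9], [0.8, 0.2]])
y = np.array([1.0, 2.0, 3.0])

# Centre X and y with differentially private means.
X_c, y_c, X_offset, y_offset, X_scale = _preprocess_data(
    X, y, fit_intercept=True, epsilon=1.0, range_X=1.0, range_y=2.0)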
Example #2

# Assumed imports, following diffprivlib's module layout:
import numpy as np
from sklearn.utils import check_array
from sklearn.utils.validation import FLOAT_DTYPES

from diffprivlib.accountant import BudgetAccountant
from diffprivlib.tools import mean
from diffprivlib.utils import warn_unused_args
from diffprivlib.validation import check_bounds, clip_to_bounds


def _preprocess_data(X,
                     y,
                     fit_intercept,
                     epsilon=1.0,
                     bounds_X=None,
                     bounds_y=None,
                     copy=True,
                     check_input=True,
                     **unused_args):
    warn_unused_args(unused_args)

    if check_input:
        X = check_array(X, copy=copy, accept_sparse=False, dtype=FLOAT_DTYPES)
    elif copy:
        X = X.copy(order='K')

    y = np.asarray(y, dtype=X.dtype)
    X_scale = np.ones(X.shape[1], dtype=X.dtype)

    if fit_intercept:
        # Validate the user-supplied bounds and clip the data into them, so the
        # sensitivity of the subsequent private means is well defined.
        bounds_X = check_bounds(bounds_X, X.shape[1])
        bounds_y = check_bounds(bounds_y, y.shape[1] if y.ndim > 1 else 1)

        X = clip_to_bounds(X, bounds_X)
        y = clip_to_bounds(y, bounds_y)

        X_offset = mean(X,
                        axis=0,
                        bounds=bounds_X,
                        epsilon=epsilon,
                        accountant=BudgetAccountant())
        X -= X_offset
        y_offset = mean(y,
                        axis=0,
                        bounds=bounds_y,
                        epsilon=epsilon,
                        accountant=BudgetAccountant())
        y = y - y_offset
    else:
        X_offset = np.zeros(X.shape[1], dtype=X.dtype)
        if y.ndim == 1:
            y_offset = X.dtype.type(0)
        else:
            y_offset = np.zeros(y.shape[1], dtype=X.dtype)

    return X, y, X_offset, y_offset, X_scale
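
The same call against the newer bounds-based signature. Bounds are (lower, upper) pairs; the scalar form below assumes check_bounds broadcasts it across features, and all values are illustrative.

import numpy as np

X = np.array([[0.1, 0.5], [0.4, 0.9], [0.8, 0.2]])
y = np.array([1.0, 2.0, 3.0])

X_c, y_c, X_offset, y_offset, X_scale = _preprocess_data(
    X, y, fit_intercept=True, epsilon=1.0,
    bounds_X=(0.0, 1.0), bounds_y=(0.0, 3.0))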
Example #3

    # Method of diffprivlib's PCA model (Wishart-mechanism variant). Assumes
    # numpy as np, warnings, svd_flip and stable_cumsum from
    # sklearn.utils.extmath, sklearn's PCA module as sk_pca, and diffprivlib's
    # mean, check_bounds, clip_to_norm, Wishart, BudgetAccountant and
    # PrivacyLeakWarning are in scope at module level.
    def _fit_full(self, X, n_components):
        # Check the privacy budget can cover this call before doing any work.
        self.accountant.check(self.epsilon, 0)

        n_samples, n_features = X.shape

        if self.centered:
            self.mean_ = np.zeros_like(np.mean(X, axis=0))
        else:
            if self.bounds is None:
                warnings.warn(
                    "Bounds parameter hasn't been specified, so falling back to determining range from the data.\n"
                    "This will result in additional privacy leakage. To ensure differential privacy with no "
                    "additional privacy loss, specify `bounds` for each dimension of the data.",
                    PrivacyLeakWarning)

                self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

            self.bounds = check_bounds(self.bounds, n_features)
            self.mean_ = mean(X,
                              epsilon=self.epsilon / 2,
                              bounds=self.bounds,
                              axis=0,
                              accountant=BudgetAccountant())

        X -= self.mean_

        if self.data_norm is None:
            warnings.warn(
                "Data norm has not been specified and will be calculated on the data provided.  This will "
                "result in additional privacy leakage. To ensure differential privacy and no additional "
                "privacy leakage, specify `data_norm` at initialisation.",
                PrivacyLeakWarning)
            self.data_norm = np.linalg.norm(X, axis=1).max()

        X = clip_to_norm(X, self.data_norm)

        XtX = np.dot(X.T, X)

        # Perturb the scatter matrix with the Wishart mechanism; the sensitivity
        # is the maximum row norm, guaranteed by the clipping above. The full
        # budget is used if the data came pre-centred, otherwise half (the other
        # half went on the private mean).
        mech = Wishart().set_epsilon(self.epsilon if self.centered else self.epsilon / 2).\
            set_sensitivity(self.data_norm)
        noisy_input = mech.randomise(XtX)

        u, s, v = np.linalg.svd(noisy_input)
        u, v = svd_flip(u, v)
        s = np.sqrt(s)

        components_ = v

        # Get variance explained by singular values
        explained_variance_ = (s**2) / (n_samples - 1)
        total_var = explained_variance_.sum()
        explained_variance_ratio_ = explained_variance_ / total_var
        singular_values_ = s.copy()  # Store the singular values.

        # Post-process the number of components required
        if n_components == 'mle':
            # Newer scikit-learn exposes _infer_dimension; older versions used
            # _infer_dimension_ with an extra n_features argument.
            try:
                n_components = sk_pca._infer_dimension(explained_variance_,
                                                       n_samples)
            except AttributeError:
                n_components = sk_pca._infer_dimension_(
                    explained_variance_, n_samples, n_features)
        elif 0 < n_components < 1.0:
            # number of components for which the cumulated explained
            # variance percentage is superior to the desired threshold
            ratio_cumsum = stable_cumsum(explained_variance_ratio_)
            n_components = np.searchsorted(ratio_cumsum, n_components) + 1

        # Compute noise covariance using Probabilistic PCA model
        # The sigma2 maximum likelihood (cf. eq. 12.46)
        if n_components < min(n_features, n_samples):
            self.noise_variance_ = explained_variance_[n_components:].mean()
        else:
            self.noise_variance_ = 0.

        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = components_[:n_components]
        self.n_components_ = n_components
        self.explained_variance_ = explained_variance_[:n_components]
        self.explained_variance_ratio_ = explained_variance_ratio_[:n_components]
        self.singular_values_ = singular_values_[:n_components]

        self.accountant.spend(self.epsilon, 0)

        return u, s, v
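
A sketch of driving this method through the public model, assuming the diffprivlib.models.PCA wrapper of the release that shipped the Wishart mechanism; the data, bounds, data_norm and epsilon below are illustrative.

import numpy as np
from diffprivlib.models import PCA

rng = np.random.default_rng(42)
X = rng.random((100, 4))

# data_norm caps each row's L2 norm; rows above it are clipped in _fit_full.
clf = PCA(n_components=2, epsilon=1.0, centered=False,
          bounds=(np.zeros(4), np.ones(4)), data_norm=2.0)
clf.fit(X)
print(clf.explained_variance_ratio_)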
Example #4
    # Method of diffprivlib's PCA model (covariance_eig variant from newer
    # releases). Assumes numpy as np, warnings, Integral from numbers, svd_flip
    # and stable_cumsum from sklearn.utils.extmath, sklearn's PCA module as
    # sk_pca, and diffprivlib's mean, covariance_eig, BudgetAccountant and
    # PrivacyLeakWarning are in scope; _check_bounds and _clip_to_norm are
    # helpers on the class.
    def _fit_full(self, X, n_components):
        # Check the privacy budget can cover this call before doing any work.
        self.accountant.check(self.epsilon, 0)

        n_samples, n_features = X.shape

        if self.centered:
            self.mean_ = np.zeros_like(np.mean(X, axis=0))
        else:
            if self.bounds is None:
                warnings.warn(
                    "Bounds parameter hasn't been specified, so falling back to determining range from the data.\n"
                    "This will result in additional privacy leakage. To ensure differential privacy with no "
                    "additional privacy loss, specify `bounds` for each dimension of the data.",
                    PrivacyLeakWarning)

                self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

            self.bounds = self._check_bounds(self.bounds, n_features)
            self.mean_ = mean(X, epsilon=self.epsilon / 2, bounds=self.bounds, axis=0, accountant=BudgetAccountant())

        X -= self.mean_

        if self.data_norm is None:
            warnings.warn("Data norm has not been specified and will be calculated on the data provided.  This will "
                          "result in additional privacy leakage. To ensure differential privacy and no additional "
                          "privacy leakage, specify `data_norm` at initialisation.", PrivacyLeakWarning)
            self.data_norm = np.linalg.norm(X, axis=1).max()

        X = self._clip_to_norm(X, self.data_norm)

        # Differentially private eigendecomposition of the covariance matrix;
        # the noise is added inside covariance_eig. The budget is halved unless
        # the data came pre-centred, and dims caps the number of eigenvectors
        # computed when n_components is already an integer.
        sigma_vec, u_mtx = covariance_eig(X, epsilon=self.epsilon if self.centered else self.epsilon / 2,
                                          norm=self.data_norm,
                                          dims=n_components if isinstance(n_components, Integral) else None)
        u_mtx, _ = svd_flip(u_mtx, np.zeros_like(u_mtx).T)
        sigma_vec = np.sqrt(sigma_vec)

        components_ = u_mtx.T

        # Get variance explained by singular values
        explained_variance_ = np.sort((sigma_vec ** 2) / (n_samples - 1))[::-1]
        total_var = explained_variance_.sum()
        explained_variance_ratio_ = explained_variance_ / total_var
        singular_values_ = sigma_vec.copy()  # Store the singular values.

        # Post-process the number of components required
        if n_components == 'mle':
            n_components = sk_pca._infer_dimension(explained_variance_, n_samples)
        elif 0 < n_components < 1.0:
            # number of components for which the cumulated explained
            # variance percentage is superior to the desired threshold
            ratio_cumsum = stable_cumsum(explained_variance_ratio_)
            n_components = np.searchsorted(ratio_cumsum, n_components) + 1

        # Compute noise covariance using Probabilistic PCA model
        # The sigma2 maximum likelihood (cf. eq. 12.46)
        if n_components < min(n_features, n_samples):
            self.noise_variance_ = explained_variance_[n_components:].mean()
        else:
            self.noise_variance_ = 0.

        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = components_[:n_components]
        self.n_components_ = n_components
        self.explained_variance_ = explained_variance_[:n_components]
        self.explained_variance_ratio_ = explained_variance_ratio_[:n_components]
        self.singular_values_ = singular_values_[:n_components]

        self.accountant.spend(self.epsilon, 0)

        return u_mtx, sigma_vec[:n_components], u_mtx.T
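
The public entry point is unchanged for this variant; a brief sketch with transform as the typical follow-up, all values illustrative.

import numpy as np
from diffprivlib.models import PCA

rng = np.random.default_rng(42)
X = rng.random((100, 4))

clf = PCA(n_components=2, epsilon=1.0, centered=False,
          bounds=(np.zeros(4), np.ones(4)), data_norm=2.0)
X_reduced = clf.fit(X).transform(X)
print(X_reduced.shape)  # (100, 2)

Relative to Example #3, the Wishart perturbation of the scatter matrix is replaced by covariance_eig, which moves the noise addition inside the eigendecomposition and lets the caller cap the number of eigenvectors computed via dims.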