Example 1
    def fit(self, X):
        """Fit detector.

        Parameters
        ----------
        X : dataframe of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        self : object
            The fitted estimator.
        """
        # validate the input X (convert the DataFrame to a checked numpy array)
        X = X.to_numpy()
        X = check_array(X)
        # PCA should be applied to standardized data (zero mean and
        # unit variance).
        if self.standardization:
            X, self.scaler_ = standardizer(X, keep_scalar=True)

        self.detector_ = sklearn_PCA(n_components=self.n_components,
                                     copy=self.copy,
                                     whiten=self.whiten,
                                     svd_solver=self.svd_solver,
                                     tol=self.tol,
                                     iterated_power=self.iterated_power,
                                     random_state=self.random_state)
        self.detector_.fit(X=X, y=None)

        # copy the attributes from the sklearn PCA object
        self.n_components_ = self.detector_.n_components_
        self.components_ = self.detector_.components_

        # validate the number of components to be used for outlier detection
        if self.n_selected_components is None:
            self.n_selected_components_ = self.n_components_
        else:
            self.n_selected_components_ = self.n_selected_components
        check_parameter(self.n_selected_components_, 1, self.n_components_,
                        include_left=True, include_right=True,
                        param_name='n_selected_components_')

        # weight the eigenvectors by their explained variance ratios
        # (uniform weights are used when weighted=False)
        self.w_components_ = np.ones([self.n_components_, ])
        if self.weighted:
            self.w_components_ = self.detector_.explained_variance_ratio_

        # The outlier score is the sum of the weighted distances from each
        # sample to the eigenvectors. Eigenvectors with smaller eigenvalues
        # have more influence.
        # Not all eigenvectors are used; only the n_selected_components_
        # smallest are kept, since they better reflect the variance change.

        self.selected_components_ = self.components_[
                                    -1 * self.n_selected_components_:, :]
        self.selected_w_components_ = self.w_components_[
                                      -1 * self.n_selected_components_:]

        self.decision_scores_ = np.sum(
            cdist(X, self.selected_components_) / self.selected_w_components_,
            axis=1).ravel()

        self._process_decision_scores()
        return self
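For reference, below is a minimal usage sketch. It assumes the fit() above belongs to a PyOD-style PCA outlier detector exposed as pyod.models.pca.PCA (or a close variant that expects a DataFrame, since the method calls X.to_numpy()); the toy data and parameter values are illustrative only.

import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from pyod.models.pca import PCA

# Toy data: a Gaussian bulk of inliers plus a few far-away points.
rng = np.random.RandomState(42)
X_train = pd.DataFrame(np.vstack([rng.randn(200, 5),
                                  rng.randn(5, 5) + 20]))

clf = PCA(n_components=5, n_selected_components=3, weighted=True)
clf.fit(X_train)

print(clf.decision_scores_[:5])  # raw outlier scores computed in fit()
print(clf.labels_[-5:])          # 0/1 labels set by _process_decision_scores()

# The scores are the weighted distances to the selected eigenvectors,
# exactly as in the fit() body above (assuming standardization=True, the default).
manual = np.sum(
    cdist(clf.scaler_.transform(X_train.to_numpy()), clf.selected_components_)
    / clf.selected_w_components_, axis=1)
print(np.allclose(manual, clf.decision_scores_))  # expected: True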
Example 2
def test_function():
    check_parameter(2, low=1, high=3)
    standardizer(np.random.rand(3, 2) * 100)
    str2bool('True')
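The same utilities can be exercised directly; here is a minimal sketch, assuming check_parameter and standardizer come from pyod.utils.utility (the snippet above does not show its imports, and the source of str2bool is not shown, so it is omitted here).

import numpy as np
from pyod.utils.utility import check_parameter, standardizer

# check_parameter raises a ValueError when the value falls outside
# (low, high); 2 lies strictly between 1 and 3, so this call passes.
check_parameter(2, low=1, high=3)

# standardizer rescales each feature to zero mean and unit variance.
X_norm = standardizer(np.random.rand(3, 2) * 100)
print(X_norm.mean(axis=0))  # roughly [0. 0.]
print(X_norm.std(axis=0))   # roughly [1. 1.]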
Example 3
def function_test():
    check_parameter(2)
    standardizer(np.random.rand(3, 2) * 100)
    str2bool('True')