Esempio n. 1
0
    def fit(self, X):
        """Fit the PCA-based detector and score the training samples.

        Parameters
        ----------
        X : dataframe or array-like of shape (n_samples, n_features)
            The input samples. Objects exposing ``to_numpy`` (e.g. a pandas
            DataFrame) are converted with it first; any other input is
            passed straight to ``check_array``.

        Returns
        -------
        self : object
            The fitted estimator.
        """
        # Convert DataFrame-like input to an ndarray. Inputs without a
        # ``to_numpy`` attribute (ndarray, nested lists, ...) are left for
        # check_array to validate instead of failing with AttributeError.
        if hasattr(X, 'to_numpy'):
            X = X.to_numpy()
        X = check_array(X)

        # PCA is recommended to use on the standardized data (zero mean and
        # unit variance). The scaler is kept on the instance as ``scaler_``.
        if self.standardization:
            X, self.scaler_ = standardizer(X, keep_scalar=True)

        self.detector_ = sklearn_PCA(n_components=self.n_components,
                                     copy=self.copy,
                                     whiten=self.whiten,
                                     svd_solver=self.svd_solver,
                                     tol=self.tol,
                                     iterated_power=self.iterated_power,
                                     random_state=self.random_state)
        self.detector_.fit(X=X, y=None)

        # copy the attributes from the sklearn PCA object
        self.n_components_ = self.detector_.n_components_
        self.components_ = self.detector_.components_

        # Number of components used for outlier detection: defaults to all
        # fitted components and must lie in [1, n_components_] (both bounds
        # are passed as inclusive).
        if self.n_selected_components is None:
            self.n_selected_components_ = self.n_components_
        else:
            self.n_selected_components_ = self.n_selected_components
        check_parameter(self.n_selected_components_, 1, self.n_components_,
                        include_left=True, include_right=True,
                        param_name='n_selected_components_')

        # Per-component weights: the explained variance ratio when
        # ``weighted`` is set, otherwise uniform weights of one.
        if self.weighted:
            self.w_components_ = self.detector_.explained_variance_ratio_
        else:
            self.w_components_ = np.ones([self.n_components_, ])

        # outlier scores is the sum of the weighted distances between each
        # sample to the eigenvectors. Distances are divided by the weights,
        # so eigenvectors with smaller weights have more influence.
        # Not all eigenvectors are used: only the last
        # n_selected_components_ rows of components_ (and their weights)
        # are kept.
        self.selected_components_ = self.components_[
                                    -1 * self.n_selected_components_:, :]
        self.selected_w_components_ = self.w_components_[
                                      -1 * self.n_selected_components_:]

        self.decision_scores_ = np.sum(
            cdist(X, self.selected_components_) / self.selected_w_components_,
            axis=1).ravel()

        # NOTE(review): presumably derives the threshold/labels from
        # decision_scores_ -- defined outside this view, confirm.
        self._process_decision_scores()
        return self
Esempio n. 2
0
    def __init__(self, n_bins=10, alpha=0.1, tol=0.5, contamination=0.1):
        """Record the constructor arguments and range-check two of them.

        Parameters
        ----------
        n_bins : default 10
            Stored unchanged as ``self.n_bins``.
        alpha : default 0.1
            Stored as ``self.alpha``; checked against the bounds 0 and 1.
        tol : default 0.5
            Stored as ``self.tol``; checked against the bounds 0 and 1.
        contamination : default 0.1
            Stored unchanged as ``self.contamination``.
        """
        super(HBOS, self).__init__()

        # Plain attribute storage; ``threshold`` starts out unset.
        self.n_bins, self.alpha, self.tol = n_bins, alpha, tol
        self.contamination = contamination
        self.threshold = None

        # Both bounded parameters share the same limits, so validate them
        # in one pass (alpha first, then tol).
        for label, value in (('alpha', alpha), ('tol', tol)):
            check_parameter(value, 0, 1, param_name=label)
Esempio n. 3
0
    def __init__(self,
                 contamination=0.1,
                 n_neighbors=20,
                 ref_set=10,
                 alpha=0.8):
        """Validate the constructor arguments and store them on the instance.

        Parameters
        ----------
        contamination : default 0.1
            Stored unchanged as ``self.contamination``.
        n_neighbors : int, default 20
            Checked with a lower bound of 1; stored as ``self.n_neighbors_``.
        ref_set : int, default 10
            Checked with a lower bound of 1 and ``n_neighbors`` as the upper
            bound; stored as ``self.ref_set_``.
        alpha : float, default 0.8
            Checked against the bounds 0.0 and 1.0; stored as
            ``self.alpha_``.

        Raises
        ------
        ValueError
            If ``n_neighbors`` or ``ref_set`` is not an ``int``, or ``alpha``
            is not a ``float``.
        """
        super(SOD, self).__init__()

        # Each argument is type-checked first, then range-checked, in the
        # order n_neighbors -> ref_set -> alpha (ref_set's upper bound
        # depends on n_neighbors).
        if not isinstance(n_neighbors, int):
            raise ValueError("n_neighbors should be int. Got %s" %
                             type(n_neighbors))
        check_parameter(n_neighbors, low=1, param_name='n_neighbors')

        if not isinstance(ref_set, int):
            raise ValueError("ref_set should be int. Got %s" % type(ref_set))
        check_parameter(ref_set,
                        low=1,
                        high=n_neighbors,
                        param_name='ref_set')

        if not isinstance(alpha, float):
            raise ValueError("alpha should be float. Got %s" % type(alpha))
        check_parameter(alpha, low=0.0, high=1.0, param_name='alpha')

        # Store the validated values; scores are not available until fitting.
        self.n_neighbors_ = n_neighbors
        self.ref_set_ = ref_set
        self.alpha_ = alpha
        self.decision_scores_ = None
        self.contamination = contamination
Esempio n. 4
0
def test_function():
    """Smoke-test the utility helpers by calling each of them once.

    No return value is asserted; the test only fails if a call raises.
    """
    check_parameter(2, low=1, high=3)

    # Random 3x2 sample matrix scaled up by a factor of 100.
    samples = 100 * np.random.rand(3, 2)
    standardizer(samples)

    str2bool('True')
Esempio n. 5
0
def function_test():
    """Call each utility helper once as a smoke test.

    Nothing is asserted on the results; a failure can only come from one of
    the calls raising.
    """
    # NOTE(review): no bounds are passed here -- confirm that this
    # project's check_parameter accepts a call without low/high.
    check_parameter(2)

    # Random 3x2 sample matrix scaled up by a factor of 100.
    scaled_samples = 100 * np.random.rand(3, 2)
    standardizer(scaled_samples)

    str2bool('True')