Example #1
    start = time.time()
    model.fit(X_train)  # fit all models with X_train
    print('Fit time:', time.time() - start)
    print()

    start = time.time()
    model.approximate(X_train)  # conduct model approximation if it is enabled
    print('Approximation time:', time.time() - start)
    print()

    start = time.time()
    predicted_labels = model.predict(X_test)  # predict labels
    print('Predict time:', time.time() - start)
    print()

    start = time.time()
    predicted_scores = model.decision_function(X_test)  # predict scores
    print('Decision Function time:', time.time() - start)
    print()

    ##########################################################################
    # compare against a baseline with no random projection,
    # no balanced parallel scheduling (BPS), and no approximation
    print("******************************************************************")
    start = time.time()
    n_estimators = len(base_estimators)
    n_estimators_list, starts, n_jobs = _partition_estimators(
        n_estimators, n_jobs)

    rp_flags = np.zeros([n_estimators, 1])
    approx_flags = np.zeros([n_estimators, 1])
    objective_dim = None
    rp_method = None
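The block above starts assembling the baseline by hand with suod's internal helpers. A simpler way to get a comparable baseline is to switch the three features off through the public constructor flags documented in Example #2. The sketch below is a hedged illustration, not part of the original snippet; it reuses base_estimators, n_jobs, X_train, and X_test from above and assumes the SUOD class is imported from suod.models.base (the path PyOD's wrapper appears to use).

import time
from suod.models.base import SUOD  # assumed import path

# hypothetical baseline: random projection, balanced parallel scheduling,
# and pseudo-supervised approximation all disabled via the public flags
baseline = SUOD(base_estimators=base_estimators, n_jobs=n_jobs,
                rp_flag_global=False, bps_flag=False,
                approx_flag_global=False)

start = time.time()
baseline.fit(X_train)  # train every base detector on the raw features
print('Baseline fit time:', time.time() - start)

baseline.approximate(X_train)  # no-op here because approximation is disabled

start = time.time()
baseline_scores = baseline.decision_function(X_test)  # per-detector outlier scores
print('Baseline decision function time:', time.time() - start)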
Example #2
class SUOD(BaseDetector):
    # noinspection PyPep8
    """SUOD (Scalable Unsupervised Outlier Detection) is an acceleration
    framework for large scale unsupervised outlier detector training and
    prediction. See :cite:`zhao2021suod` for details.

    Parameters
    ----------
    base_estimators : list, length must be greater than 1
        A list of base estimators. Certain methods must be present, e.g.,
        `fit` and `predict`.

    combination : str, optional (default='average')
        Decide how to aggregate the results from multiple models:

        - "average" : average the results from all base detectors
        - "maximization" : output the max value across all base detectors

    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set,
        i.e. the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.

    n_jobs : int, optional (default=None)
        The number of jobs to run in parallel for both `fit` and
        `predict`. If -1, then the number of jobs is set to the
        number of cores.

    rp_clf_list : list, optional (default=None)
        The list of outlier detection models to use random projection. The
        detector name should be consistent with PyOD.

    rp_ng_clf_list : list, optional (default=None)
        The list of outlier detection models NOT to use random projection. The
        detector name should be consistent with PyOD.

    rp_flag_global : bool, optional (default=True)
        If set to False, random projection is turned off for all base models.

    target_dim_frac : float in (0., 1), optional (default=0.5)
        The target compression ratio.

    jl_method : string, optional (default = 'basic')
        The JL projection method:

        - "basic": each component of the transformation matrix is taken at
          random in N(0,1).
        - "discrete", each component of the transformation matrix is taken at
          random in {-1,1}.
        - "circulant": the first row of the transformation matrix is taken at
          random in N(0,1), and each row is obtained from the previous one
          by a one-left shift.
        - "toeplitz": the first row and column of the transformation matrix
          is taken at random in N(0,1), and each diagonal has a constant value
          taken from these first vector.

    bps_flag : bool, optional (default=True)
        If set to False, balanced parallel scheduling is turned off.

    approx_clf_list : list, optional (default=None)
        The list of outlier detection models to use pseudo-supervised
        approximation. The detector name should be consistent with PyOD.

    approx_ng_clf_list : list, optional (default=None)
        The list of outlier detection models NOT to use pseudo-supervised
        approximation. The detector name should be consistent with PyOD.

    approx_flag_global : bool, optional (default=True)
        If set to False, pseudo-supervised approximation is turned off.

    approx_clf : object, optional (default: sklearn RandomForestRegressor)
        The supervised model used to approximate unsupervised models.

    cost_forecast_loc_fit : str, optional
        The location of the pretrained cost-forecast model used for training.

    cost_forecast_loc_pred : str, optional
        The location of the pretrained cost-forecast model used for prediction.

    verbose : bool, optional (default=False)
        Controls the verbosity of the building process.

    Attributes
    ----------
    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is
        fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """
    def __init__(self,
                 base_estimators=None,
                 contamination=0.1,
                 combination='average',
                 n_jobs=None,
                 rp_clf_list=None,
                 rp_ng_clf_list=None,
                 rp_flag_global=True,
                 target_dim_frac=0.5,
                 jl_method='basic',
                 bps_flag=True,
                 approx_clf_list=None,
                 approx_ng_clf_list=None,
                 approx_flag_global=True,
                 approx_clf=None,
                 cost_forecast_loc_fit=None,
                 cost_forecast_loc_pred=None,
                 verbose=False):
        super(SUOD, self).__init__(contamination=contamination)
        self.base_estimators = base_estimators
        self.contamination = contamination
        self.combination = combination
        self.n_jobs = n_jobs
        self.rp_clf_list = rp_clf_list
        self.rp_ng_clf_list = rp_ng_clf_list
        self.rp_flag_global = rp_flag_global
        self.target_dim_frac = target_dim_frac
        self.jl_method = jl_method
        self.bps_flag = bps_flag
        self.approx_clf_list = approx_clf_list
        self.approx_ng_clf_list = approx_ng_clf_list
        self.approx_flag_global = approx_flag_global
        self.approx_clf = approx_clf
        self.cost_forecast_loc_fit = cost_forecast_loc_fit
        self.cost_forecast_loc_pred = cost_forecast_loc_pred
        self.verbose = verbose

        # by default, provide a group of well-performing base detectors
        if self.base_estimators is None:
            self.base_estimators = [
                LOF(n_neighbors=15),
                LOF(n_neighbors=20),
                HBOS(n_bins=10),
                HBOS(n_bins=20),
                COPOD(),
                IForest(n_estimators=50),
                IForest(n_estimators=100),
                IForest(n_estimators=150)
            ]

        self.n_estimators = len(self.base_estimators)

        # pass in the arguments for SUOD model
        self.model_ = SUOD_model(
            base_estimators=self.base_estimators,
            contamination=self.contamination,
            n_jobs=self.n_jobs,
            rp_clf_list=self.rp_clf_list,
            rp_ng_clf_list=self.rp_ng_clf_list,
            rp_flag_global=self.rp_flag_global,
            target_dim_frac=self.target_dim_frac,
            jl_method=self.jl_method,
            approx_clf_list=self.approx_clf_list,
            approx_ng_clf_list=self.approx_ng_clf_list,
            approx_flag_global=self.approx_flag_global,
            approx_clf=self.approx_clf,
            bps_flag=self.bps_flag,
            cost_forecast_loc_fit=self.cost_forecast_loc_fit,
            cost_forecast_loc_pred=self.cost_forecast_loc_pred,
            verbose=self.verbose,
        )

    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """

        # validate inputs X and y (optional)
        X = check_array(X)
        n_samples, n_features = X.shape[0], X.shape[1]
        self._set_n_classes(y)

        # fit the model and then approximate it
        self.model_.fit(X)
        self.model_.approximate(X)

        # get the decision scores from each base estimator
        decision_score_mat = np.zeros([n_samples, self.n_estimators])
        for i in range(self.n_estimators):
            decision_score_mat[:, i] = self.model_.base_estimators[
                i].decision_scores_

        # the scores must be standardized before combination
        decision_score_mat, self.score_scalar_ = standardizer(
            decision_score_mat, keep_scalar=True)

        # todo: may support other combination
        if self.combination == 'average':
            decision_score = average(decision_score_mat)
        else:
            decision_score = maximization(decision_score_mat)

        assert (len(decision_score) == n_samples)

        self.decision_scores_ = decision_score.ravel()
        self._process_decision_scores()

        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detectors.

        The anomaly score of an input sample is computed based on the
        fitted base detectors. For consistency, outliers are assigned
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(
            self, ['model_', 'decision_scores_', 'threshold_', 'labels_'])

        X = check_array(X)

        # get the outlier scores from all base estimators
        predicted_scores = self.model_.decision_function(X)

        # standardize the score and combine
        predicted_scores = self.score_scalar_.transform(predicted_scores)

        # todo: may support other combination
        if self.combination == 'average':
            decision_score = average(predicted_scores)
        else:
            decision_score = maximization(predicted_scores)

        assert (len(decision_score) == X.shape[0])

        return decision_score.ravel()
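For completeness, here is a hedged end-to-end usage sketch of the wrapper class above. The import paths (pyod.models.suod.SUOD and pyod.utils.data.generate_data) are assumptions based on how the class and helper appear elsewhere in these examples; the attributes read at the end are the ones documented in the docstring.

from pyod.models.suod import SUOD            # assumed import path for the wrapper above
from pyod.utils.data import generate_data    # assumed helper, as used in Example #3

# synthetic data with 10% outliers, mirroring the test setup in Example #3
X_train, y_train, X_test, y_test = generate_data(
    n_train=1000, n_test=500, contamination=0.1, random_state=42)

detector = SUOD(contamination=0.1, combination='average', n_jobs=2)
detector.fit(X_train)  # fits the default ensemble and its approximators internally

train_scores = detector.decision_scores_          # standardized, combined training scores
train_labels = detector.labels_                   # 0 = inlier, 1 = outlier via threshold_
test_scores = detector.decision_function(X_test)  # combined scores for unseen data
test_labels = detector.predict(X_test)            # binary labels for unseen data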
Example #3
class TestBASE(unittest.TestCase):
    def setUp(self):
        self.n_train = 1000
        self.n_test = 500
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.random_state = 42
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=self.random_state)

        self.base_estimators = [
            LOF(n_neighbors=5, contamination=self.contamination),
            LOF(n_neighbors=15, contamination=self.contamination),
            LOF(n_neighbors=25, contamination=self.contamination),
            LOF(n_neighbors=35, contamination=self.contamination),
            LOF(n_neighbors=45, contamination=self.contamination),
            HBOS(contamination=self.contamination),
            PCA(contamination=self.contamination),
            LSCP(detector_list=[
                LOF(n_neighbors=5, contamination=self.contamination),
                LOF(n_neighbors=15, contamination=self.contamination)],
                random_state=self.random_state)
        ]

        this_directory = os.path.abspath(os.path.dirname(__file__))

        self.cost_forecast_loc_fit_ = os.path.join(this_directory,
                                                   'bps_train.joblib')

        self.cost_forecast_loc_pred_ = os.path.join(this_directory,
                                                    'bps_prediction.joblib')

        self.model = SUOD(base_estimators=self.base_estimators, n_jobs=2,
                          rp_flag_global=True, bps_flag=True,
                          contamination=self.contamination,
                          approx_flag_global=True,
                          cost_forecast_loc_fit=self.cost_forecast_loc_fit_,
                          cost_forecast_loc_pred=self.cost_forecast_loc_pred_)

    def test_initialization(self):
        self.model.get_params()
        self.model.set_params(**{'n_jobs': 4})

    def test_fit(self):
        """Test fitting all base estimators on the training data."""
        self.model.fit(self.X_train)

    def test_approximate(self):
        self.model.fit(self.X_train)
        self.model.approximate(self.X_train)

    def test_predict(self):
        self.model.fit(self.X_train)
        self.model.approximate(self.X_train)
        self.model.predict(self.X_test)

    def test_decision_function(self):
        self.model.fit(self.X_train)
        self.model.approximate(self.X_train)
        self.model.decision_function(self.X_test)
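setUp defines roc_floor = 0.6, but the tests shown only exercise the API. Below is a hedged sketch of the kind of additional test method such a floor is typically used for; the method name, the averaging step, and the roc_auc_score check are illustrative assumptions, not part of the original test file.

    def test_prediction_quality(self):
        # illustrative check: combined test scores should reach the ROC-AUC floor
        from sklearn.metrics import roc_auc_score
        from pyod.models.combination import average

        self.model.fit(self.X_train)
        self.model.approximate(self.X_train)
        test_scores = self.model.decision_function(self.X_test)  # per-detector score matrix
        combined = average(test_scores)  # one combined score per test sample
        assert roc_auc_score(self.y_test, combined) >= self.roc_floor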
Example #4
        KNN(n_neighbors=25, contamination=contamination),
        KNN(n_neighbors=35, contamination=contamination),
        KNN(n_neighbors=45, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        LSCP(detector_list=[LOF(contamination=contamination),
                            LOF(contamination=contamination)])
    ]
    
    
    model = SUOD(base_estimators=base_estimators, n_jobs=6, bps_flag=True, 
                 contamination=contamination, approx_flag_global=False)

    model.fit(X)  # fit all models with X
    model.approximate(X)  # conduct model approximation if it is enabled
    predicted_labels = model.predict(X)  # predict labels on X; for demo purpose only
    predicted_scores = model.decision_function(X)  # predict scores on X; for demo purpose only

    # %%
    evaluate_print('majority vote', y, majority_vote(predicted_labels))
    evaluate_print('average', y, average(predicted_scores))
    evaluate_print('maximization', y, maximization(predicted_scores))

    clf = LOF()
    clf.fit(X)
    evaluate_print('LOF', y, clf.decision_scores_)

    clf = IForest()
    clf.fit(X)
    evaluate_print('IForest', y, clf.decision_scores_)
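The combination calls above operate on the raw per-detector score matrix. The wrapper in Example #2 standardizes the columns before combining so that detectors with wide score ranges do not dominate; a hedged sketch of the same step, assuming standardizer from pyod.utils.utility and reusing predicted_scores, average, maximization, and evaluate_print from the snippet above:

    from pyod.utils.utility import standardizer  # assumed helper, as used in Example #2

    # z-score each detector's column before combining (mirrors Example #2's fit step)
    standardized_scores = standardizer(predicted_scores)

    evaluate_print('average (standardized)', y, average(standardized_scores))
    evaluate_print('maximization (standardized)', y, maximization(standardized_scores))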