def train():
    """Train a SUOD ensemble, score it with ROC AUC and log the run to MLflow.

    The function builds a pool of LOF, PCA and KNN detectors, wraps them in a
    SUOD model, fits it on the data returned by ``get_data``, reduces the
    model's predictions with ``vote`` and compares the result against the
    hard-coded ground-truth labels. The AUC and the fitted model are logged
    inside a single MLflow run.
    """
    dataset = get_data(1000, 10, 100)
    contamination = 0.01
    with mlflow.start_run():
        base_estimators = [
            LOF(n_neighbors=5, contamination=contamination),
            LOF(n_neighbors=15, contamination=contamination),
            LOF(n_neighbors=25, contamination=contamination),
            PCA(contamination=contamination),
            KNN(n_neighbors=5, contamination=contamination),
            KNN(n_neighbors=15, contamination=contamination),
            KNN(n_neighbors=25, contamination=contamination),
        ]
        model = SUOD(base_estimators=base_estimators, n_jobs=6,
                     rp_flag_global=True,
                     bps_flag=True,
                     approx_flag_global=False,
                     contamination=contamination)
        model.fit(dataset)
        model.approximate(dataset)
        predicted_labels = model.predict(dataset)
        voted_labels = vote(predicted_labels)
        # NOTE(review): assumes get_data returns the 1000 inliers first,
        # followed by the 10 outliers -- confirm against get_data.
        true_labels = [0] * 1000 + [1] * 10
        # roc_auc_score expects (y_true, y_score); the ground truth must come
        # first, otherwise the predictions are treated as the reference.
        auc_score = roc_auc_score(true_labels, voted_labels)
        print("The resulted area under the ROC curve score is {}".format(auc_score))
        mlflow.log_metric("auc_score", auc_score)
        mlflow.sklearn.log_model(model, "anomaly_model", conda_env="conda.yaml")
Example #2
0
class ensemble(abstract_occ_model):
    """One-class model backed by a SUOD ensemble of KNN and IForest detectors.

    The per-detector probabilities returned by the SUOD model's
    ``predict_proba`` are averaged into one score per sample; ``predict``
    thresholds that score at 0.5.
    """

    def __init__(self, nu=0.1):
        """Create the base detectors and the SUOD model that wraps them.

        :param nu: contamination value handed to every base detector.
        """
        self.base_estimators = [
            #OCSVM(contamination=nu),
            KNN(n_neighbors=100, contamination=nu),
            KNN(n_neighbors=25, contamination=nu),
            KNN(n_neighbors=5, contamination=nu),
            IForest(contamination=nu)
        ]
        self.model = SUOD(base_estimators=self.base_estimators,
                          rp_flag_global=False,
                          bps_flag=True,
                          approx_flag_global=False)
        # Scores of the most recently scored input (None until first use).
        self.scores = None

    def fit(self, X):
        """Fit the SUOD model on ``X`` and run its approximation step.

        :param X: training samples.
        """
        self.model.fit(X)
        self.model.approximate(X)

    def predict(self, X):
        """Threshold the averaged score of every sample in ``X`` at 0.5.

        :param X: samples to label.
        :return: array with 1 where the score is >= 0.5 and -1 where it is
            < 0.5; a score satisfying neither comparison (NaN) is passed
            through unchanged.
        """
        self.scores = self.compute_score(X)
        return np.where(self.scores >= 0.5, 1,
                        np.where(self.scores < 0.5, -1, self.scores))

    def score_samples(self, X):
        """Return the averaged score of every sample in ``X``.

        The scores are always computed for the ``X`` that is passed in.
        Reusing whatever was cached by an earlier call would silently return
        scores belonging to a different data set.

        :param X: samples to score.
        :return: one averaged score per sample.
        """
        self.scores = self.compute_score(X)
        return self.scores

    def compute_score(self, X):
        """Average the SUOD model's ``predict_proba`` output over axis 1.

        :param X: samples to score.
        :return: row-wise mean of ``self.model.predict_proba(X)``.
        """
        return np.mean(self.model.predict_proba(X), axis=1)
Example #3
0
    ##########################################################################
    model = SUOD(base_estimators=base_estimators,
                 rp_flag_global=True,
                 approx_clf=approx_clf,
                 n_jobs=n_jobs,
                 bps_flag=True,
                 contamination=contamination,
                 approx_flag_global=True)

    start = time.time()
    model.fit(X_train)  # fit all models with X
    print('Fit time:', time.time() - start)
    print()

    start = time.time()
    model.approximate(X_train)  # conduct model approximation if it is enabled
    print('Approximation time:', time.time() - start)
    print()

    start = time.time()
    predicted_labels = model.predict(X_test)  # predict labels
    print('Predict time:', time.time() - start)
    print()

    start = time.time()
    predicted_scores = model.decision_function(X_test)  # predict scores
    print('Decision Function time:', time.time() - start)
    print()

    ##########################################################################
    # compare with no projection, no bps, and no approximation
Example #4
0
        X = np.append(arr=X, values=features_pca, axis=0)
        X_num = X.shape[0]
        base_estimators = [LOF(), IForest(), OCSVM(kernel="rbf", gamma=0.001)]

        model = SUOD(
            base_estimators=base_estimators,
            n_jobs=2,  # number of workers(if -1 it use full core)
            rp_flag_global=True,  # global flag for random projection
            bps_flag=True,  # global flag for balanced parallel scheduling
            approx_flag_global=False,  # global flag for model approximation
            contamination=0.2)

        # X_train, X_test = train_test_split(X, test_size=0, random_state=123)\
        model.fit(X)
        model.approximate(X)
        predicted_labels = model.predict(X)

        sum_labels = np.sum(predicted_labels, axis=1) / 3
        sum_labels = np.where(sum_labels >= 0.5, -1,
                              1)  # -1 abnormal, 1 normal
        result_label = np.average(sum_labels)
        result_label = result_label.tolist()

        # Add outliers
        fig = plt.figure()
        colors = np.array(['r', 'b'])
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(X[:, 0],
                   X[:, 1],
                   X[:, 2],
Example #5
0
class TestBASE(unittest.TestCase):
    """Smoke tests for the SUOD model: each test only checks that the calls
    complete without raising."""

    def setUp(self):
        """Generate synthetic data, the base detector pool and the model."""
        self.n_train = 1000
        self.n_test = 500
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.random_state = 42

        splits = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=self.random_state)
        self.X_train, self.y_train, self.X_test, self.y_test = splits

        # Five LOF detectors that differ only in neighbourhood size,
        # followed by HBOS, PCA and an LSCP built on two more LOFs.
        self.base_estimators = [
            LOF(n_neighbors=k, contamination=self.contamination)
            for k in (5, 15, 25, 35, 45)
        ] + [
            HBOS(contamination=self.contamination),
            PCA(contamination=self.contamination),
            LSCP(
                detector_list=[
                    LOF(n_neighbors=5, contamination=self.contamination),
                    LOF(n_neighbors=15, contamination=self.contamination),
                ],
                random_state=self.random_state),
        ]

        # The pretrained cost forecasters are looked up next to this file.
        here = os.path.abspath(os.path.dirname(__file__))
        self.cost_forecast_loc_fit_ = os.path.join(here, 'bps_train.joblib')
        self.cost_forecast_loc_pred_ = os.path.join(here,
                                                    'bps_prediction.joblib')

        self.model = SUOD(
            base_estimators=self.base_estimators,
            n_jobs=2,
            rp_flag_global=True,
            bps_flag=True,
            contamination=self.contamination,
            approx_flag_global=True,
            cost_forecast_loc_fit=self.cost_forecast_loc_fit_,
            cost_forecast_loc_pred=self.cost_forecast_loc_pred_)

    def _fit_and_approximate(self):
        """Fit the model on the training data, then run approximation."""
        self.model.fit(self.X_train)
        self.model.approximate(self.X_train)

    def test_initialization(self):
        """Parameters can be read back and updated on the model."""
        self.model.get_params()
        self.model.set_params(n_jobs=4)

    def test_fit(self):
        """Fitting on the training data completes."""
        self.model.fit(self.X_train)

    def test_approximate(self):
        """Approximation can be run after fitting."""
        self._fit_and_approximate()

    def test_predict(self):
        """``predict`` can be called on the test data after fit/approximate."""
        self._fit_and_approximate()
        self.model.predict(self.X_test)

    def test_decision_function(self):
        """``decision_function`` can be called on the test data after
        fit/approximate."""
        self._fit_and_approximate()
        self.model.decision_function(self.X_test)
Example #6
0
class SUOD(BaseDetector):
    # noinspection PyPep8
    """SUOD (Scalable Unsupervised Outlier Detection) is an acceleration
    framework for large scale unsupervised outlier detector training and
    prediction. See :cite:`zhao2021suod` for details.

    Parameters
    ----------
    base_estimators : list, length must be greater than 1
        A list of base estimators. Certain methods must be present, e.g.,
        `fit` and `predict`. If None, a default group of LOF, HBOS, COPOD
        and IForest detectors is used.

    combination : str, optional (default='average')
        Decide how to aggregate the results from multiple models:

        - "average" : average the results from all base detectors
        - "maximization" : output the max value across all base detectors

        Any value other than "average" is treated as "maximization".

    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set,
        i.e. the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.

    n_jobs : optional (default=None)
        The number of jobs to run in parallel for both `fit` and
        `predict`. If -1, then the number of jobs is set to the
        number of jobs that can actually run in parallel. The value is
        forwarded unchanged to the underlying SUOD model.

    rp_clf_list : list, optional (default=None)
        The list of outlier detection models to use random projection. The
        detector name should be consistent with PyOD.

    rp_ng_clf_list : list, optional (default=None)
        The list of outlier detection models NOT to use random projection. The
        detector name should be consistent with PyOD.

    rp_flag_global : bool, optional (default=True)
        If set to False, random projection is turned off for all base models.

    target_dim_frac : float in (0., 1), optional (default=0.5)
        The target compression ratio.

    jl_method : string, optional (default = 'basic')
        The JL projection method:

        - "basic": each component of the transformation matrix is taken at
          random in N(0,1).
        - "discrete", each component of the transformation matrix is taken at
          random in {-1,1}.
        - "circulant": the first row of the transformation matrix is taken at
          random in N(0,1), and each row is obtained from the previous one
          by a one-left shift.
        - "toeplitz": the first row and column of the transformation matrix
          is taken at random in N(0,1), and each diagonal has a constant value
          taken from these first vector.

    bps_flag : bool, optional (default=True)
        If set to False, balanced parallel scheduling is turned off.

    approx_clf_list : list, optional (default=None)
        The list of outlier detection models to use pseudo-supervised
        approximation. The detector name should be consistent with PyOD.

    approx_ng_clf_list : list, optional (default=None)
        The list of outlier detection models NOT to use pseudo-supervised
        approximation. The detector name should be consistent with PyOD.

    approx_flag_global : bool, optional (default=True)
        If set to False, pseudo-supervised approximation is turned off.

    approx_clf : object, optional (default: sklearn RandomForestRegressor)
        The supervised model used to approximate unsupervised models.

    cost_forecast_loc_fit : str, optional
        The location of the pretrained cost prediction forecast for training.

    cost_forecast_loc_pred : str, optional
        The location of the pretrained cost prediction forecast for prediction.

    verbose : bool or int, optional (default=False)
        Controls the verbosity of the building process.

    Attributes
    ----------
    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is
        fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """

    def __init__(self,
                 base_estimators=None,
                 contamination=0.1,
                 combination='average',
                 n_jobs=None,
                 rp_clf_list=None,
                 rp_ng_clf_list=None,
                 rp_flag_global=True,
                 target_dim_frac=0.5,
                 jl_method='basic',
                 bps_flag=True,
                 approx_clf_list=None,
                 approx_ng_clf_list=None,
                 approx_flag_global=True,
                 approx_clf=None,
                 cost_forecast_loc_fit=None,
                 cost_forecast_loc_pred=None,
                 verbose=False):
        super(SUOD, self).__init__(contamination=contamination)
        self.base_estimators = base_estimators
        self.contamination = contamination
        self.combination = combination
        self.n_jobs = n_jobs
        self.rp_clf_list = rp_clf_list
        self.rp_ng_clf_list = rp_ng_clf_list
        self.rp_flag_global = rp_flag_global
        self.target_dim_frac = target_dim_frac
        self.jl_method = jl_method
        self.bps_flag = bps_flag
        self.approx_clf_list = approx_clf_list
        self.approx_ng_clf_list = approx_ng_clf_list
        self.approx_flag_global = approx_flag_global
        self.approx_clf = approx_clf
        self.cost_forecast_loc_fit = cost_forecast_loc_fit
        self.cost_forecast_loc_pred = cost_forecast_loc_pred
        self.verbose = verbose

        # by default we will provide a group of performing models
        if self.base_estimators is None:
            self.base_estimators = [
                LOF(n_neighbors=15),
                LOF(n_neighbors=20),
                HBOS(n_bins=10),
                HBOS(n_bins=20),
                COPOD(),
                IForest(n_estimators=50),
                IForest(n_estimators=100),
                IForest(n_estimators=150)
            ]

        self.n_estimators = len(self.base_estimators)

        # pass in the arguments for SUOD model
        # NOTE(review): model_ is built once here from the constructor
        # values; later attribute changes are not copied into it by any
        # code visible in this class.
        self.model_ = SUOD_model(
            base_estimators=self.base_estimators,
            contamination=self.contamination,
            n_jobs=self.n_jobs,
            rp_clf_list=self.rp_clf_list,
            rp_ng_clf_list=self.rp_ng_clf_list,
            rp_flag_global=self.rp_flag_global,
            target_dim_frac=self.target_dim_frac,
            jl_method=self.jl_method,
            approx_clf_list=self.approx_clf_list,
            approx_ng_clf_list=self.approx_ng_clf_list,
            approx_flag_global=self.approx_flag_global,
            approx_clf=self.approx_clf,
            bps_flag=self.bps_flag,
            cost_forecast_loc_fit=self.cost_forecast_loc_fit,
            cost_forecast_loc_pred=self.cost_forecast_loc_pred,
            verbose=self.verbose,
        )

    def _combine_scores(self, score_mat):
        """Collapse the per-detector score matrix into one score per sample.

        Parameters
        ----------
        score_mat : numpy array of shape (n_samples, n_estimators)
            Standardized scores, one column per base detector.

        Returns
        -------
        combined : numpy array
            Result of ``average`` when ``combination == 'average'``,
            otherwise the result of ``maximization``.
        """
        # todo: may support other combination
        if self.combination == 'average':
            return average(score_mat)
        return maximization(score_mat)

    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """

        # validate inputs X and y (optional)
        X = check_array(X)
        n_samples = X.shape[0]
        self._set_n_classes(y)

        # fit the model and then approximate it
        self.model_.fit(X)
        self.model_.approximate(X)

        # get the decision scores from each base estimators
        decision_score_mat = np.zeros([n_samples, self.n_estimators])
        for i in range(self.n_estimators):
            decision_score_mat[:, i] = self.model_.base_estimators[
                i].decision_scores_

        # the scores must be standardized before combination; the fitted
        # scaler is kept so decision_function can apply the same transform
        decision_score_mat, self.score_scalar_ = standardizer(
            decision_score_mat, keep_scalar=True)

        decision_score = self._combine_scores(decision_score_mat)

        assert (len(decision_score) == n_samples)

        self.decision_scores_ = decision_score.ravel()
        self._process_decision_scores()

        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detectors.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(
            self, ['model_', 'decision_scores_', 'threshold_', 'labels_'])

        X = check_array(X)

        # initialize the output score
        predicted_scores = self.model_.decision_function(X)

        # standardize with the scaler learned in fit, then combine
        predicted_scores = self.score_scalar_.transform(predicted_scores)

        decision_score = self._combine_scores(predicted_scores)

        assert (len(decision_score) == X.shape[0])

        return decision_score.ravel()
Example #7
0
class TestModelSaveLoad(unittest.TestCase):
    """Checks that a fitted SUOD model can be dumped to and loaded from
    ``model.joblib`` in the current working directory."""

    def setUp(self):
        """Generate synthetic data, the base detector pool and the model."""
        self.n_train = 1000
        self.n_test = 500
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.random_state = 42

        splits = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=self.random_state)
        self.X_train, self.y_train, self.X_test, self.y_test = splits

        # Five LOF detectors that differ only in neighbourhood size,
        # followed by HBOS, PCA and an LSCP built on two more LOFs.
        self.base_estimators = [
            LOF(n_neighbors=k, contamination=self.contamination)
            for k in (5, 15, 25, 35, 45)
        ] + [
            HBOS(contamination=self.contamination),
            PCA(contamination=self.contamination),
            LSCP(
                detector_list=[
                    LOF(n_neighbors=5, contamination=self.contamination),
                    LOF(n_neighbors=15, contamination=self.contamination),
                ],
                random_state=self.random_state),
        ]

        # The pretrained cost forecasters are looked up next to this file.
        here = os.path.abspath(os.path.dirname(__file__))
        self.cost_forecast_loc_fit_ = os.path.join(here, 'bps_train.joblib')
        self.cost_forecast_loc_pred_ = os.path.join(here,
                                                    'bps_prediction.joblib')

        self.model = SUOD(
            base_estimators=self.base_estimators,
            n_jobs=2,
            rp_flag_global=True,
            bps_flag=True,
            contamination=self.contamination,
            approx_flag_global=True,
            cost_forecast_loc_fit=self.cost_forecast_loc_fit_,
            cost_forecast_loc_pred=self.cost_forecast_loc_pred_,
            verbose=True)

    def _fit_and_approximate(self):
        """Fit all models on the training data, then run model
        approximation (a no-op for the model if it is not enabled)."""
        self.model.fit(self.X_train)
        self.model.approximate(self.X_train)

    def test_save(self):
        """Dumping the fitted model creates the file on disk."""
        self._fit_and_approximate()

        # save the model
        dump(self.model, 'model.joblib')
        assert (os.path.exists('model.joblib'))
        os.remove('model.joblib')

    def test_load(self):
        """A model loaded back from disk can predict, score and return
        probabilities for the test data."""
        self._fit_and_approximate()

        # save the model and read it back
        dump(self.model, 'model.joblib')
        restored = load('model.joblib')

        predicted_labels = restored.predict(self.X_test)  # predict labels
        # The remaining calls are only exercised; their results are not
        # asserted on.
        restored.decision_function(self.X_test)  # predict scores
        restored.predict_proba(self.X_test)  # predict probabilities

        assert (len(predicted_labels) != 0)

    def tearDown(self):
        """Remove the dumped model file if a test left it behind."""
        if os.path.exists('model.joblib'):
            os.remove('model.joblib')