def train():
    dataset = get_data(1000, 10, 100)
    contamination = 0.01

    with mlflow.start_run():
        base_estimators = [
            LOF(n_neighbors=5, contamination=contamination),
            LOF(n_neighbors=15, contamination=contamination),
            LOF(n_neighbors=25, contamination=contamination),
            PCA(contamination=contamination),
            KNN(n_neighbors=5, contamination=contamination),
            KNN(n_neighbors=15, contamination=contamination),
            KNN(n_neighbors=25, contamination=contamination)]
        model = SUOD(base_estimators=base_estimators, n_jobs=6,
                     rp_flag_global=True, bps_flag=True,
                     approx_flag_global=False, contamination=contamination)

        model.fit(dataset)
        model.approximate(dataset)

        predicted_labels = model.predict(dataset)
        voted_labels = vote(predicted_labels)
        true_labels = [0] * 1000 + [1] * 10
        # roc_auc_score expects the ground-truth labels first, then the predictions
        auc_score = roc_auc_score(true_labels, voted_labels)
        print("The resulting area under the ROC curve is {}".format(auc_score))
        mlflow.log_metric("auc_score", auc_score)
        mlflow.sklearn.log_model(model, "anomaly_model", conda_env="conda.yaml")
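# The `vote` helper used above is not shown in the snippet. A minimal
# majority-vote sketch (hypothetical, not from the source), assuming
# `predicted_labels` is an (n_samples, n_estimators) matrix of 0/1 labels:
import numpy as np

def vote(predicted_labels, threshold=0.5):
    # flag a sample as an outlier when at least half of the detectors agree
    return (np.mean(predicted_labels, axis=1) >= threshold).astype(int)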
class ensemble(abstract_occ_model):
    """Ensemble of PyOD detectors accelerated with SUOD."""

    def __init__(self, nu=0.1):
        self.base_estimators = [
            # OCSVM(contamination=nu),
            KNN(n_neighbors=100, contamination=nu),
            KNN(n_neighbors=25, contamination=nu),
            KNN(n_neighbors=5, contamination=nu),
            IForest(contamination=nu)
        ]
        self.model = SUOD(base_estimators=self.base_estimators,
                          rp_flag_global=False, bps_flag=True,
                          approx_flag_global=False)
        self.scores = None

    def fit(self, X):
        self.model.fit(X)
        self.model.approximate(X)

    def predict(self, X):
        self.scores = self.compute_score(X)
        # map the mean outlier probability to the {1: outlier, -1: inlier} convention
        return np.where(self.scores >= 0.5, 1, -1)

    def score_samples(self, X):
        # compute the scores lazily on first call and cache them
        if not isinstance(self.scores, np.ndarray):
            self.scores = self.compute_score(X)
        return self.scores

    def compute_score(self, X):
        # average the outlier probabilities across all base estimators
        return np.mean(self.model.predict_proba(X), axis=1)
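# A short usage sketch for the wrapper class above; the random data and
# variable names are illustrative only, not from the source:
import numpy as np

rng = np.random.RandomState(42)
X = rng.randn(500, 8)          # mostly inliers
X[:10] += 6                    # shift a few rows to act as outliers

clf = ensemble(nu=0.1)
clf.fit(X)
labels = clf.predict(X)        # 1 for outliers, -1 for inliers
scores = clf.score_samples(X)  # cached mean outlier probabilities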
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.4, random_state=42)

# standardize data to be digestible for most algorithms
X_train, X_test = standardizer(X_train, X_test)
contamination = y.sum() / len(y)

# get estimators for training and prediction
base_estimators = get_estimators(contamination=contamination)

##########################################################################
model = SUOD(base_estimators=base_estimators, rp_flag_global=True,
             approx_clf=approx_clf, n_jobs=n_jobs, bps_flag=True,
             contamination=contamination, approx_flag_global=True)

start = time.time()
model.fit(X_train)  # fit all models with X
print('Fit time:', time.time() - start)
print()

start = time.time()
model.approximate(X_train)  # conduct model approximation if it is enabled
print('Approximation time:', time.time() - start)
print()

start = time.time()
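# The snippet stops right after restarting the timer. A plausible
# continuation that times the prediction step the same way (a sketch,
# not from the source; assumes X_test from the split above):
predicted_labels = model.predict(X_test)  # per-detector binary labels
print('Prediction time:', time.time() - start)
print()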
""" anomaly_algorithms = [ ("Robust covariance", EllipticEnvelope()), ("One-Class SVM", svm.OneClassSVM(kernel="rbf", gamma=0.001)), ("Isolation Forest", IsolationForest(random_state=42)), ("Local Outlier Factor", LocalOutlierFactor()) ] X = np.append(arr=X, values=features_pca, axis=0) X_num = X.shape[0] base_estimators = [LOF(), IForest(), OCSVM(kernel="rbf", gamma=0.001)] model = SUOD( base_estimators=base_estimators, n_jobs=2, # number of workers(if -1 it use full core) rp_flag_global=True, # global flag for random projection bps_flag=True, # global flag for balanced parallel scheduling approx_flag_global=False, # global flag for model approximation contamination=0.2) # X_train, X_test = train_test_split(X, test_size=0, random_state=123)\ model.fit(X) model.approximate(X) predicted_labels = model.predict(X) sum_labels = np.sum(predicted_labels, axis=1) / 3 sum_labels = np.where(sum_labels >= 0.5, -1, 1) # -1 abnormal, 1 normal result_label = np.average(sum_labels) result_label = result_label.tolist()
class TestBASE(unittest.TestCase):
    def setUp(self):
        self.n_train = 1000
        self.n_test = 500
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.random_state = 42
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=self.random_state)

        self.base_estimators = [
            LOF(n_neighbors=5, contamination=self.contamination),
            LOF(n_neighbors=15, contamination=self.contamination),
            LOF(n_neighbors=25, contamination=self.contamination),
            LOF(n_neighbors=35, contamination=self.contamination),
            LOF(n_neighbors=45, contamination=self.contamination),
            HBOS(contamination=self.contamination),
            PCA(contamination=self.contamination),
            LSCP(detector_list=[
                LOF(n_neighbors=5, contamination=self.contamination),
                LOF(n_neighbors=15, contamination=self.contamination)],
                random_state=self.random_state)
        ]

        this_directory = os.path.abspath(os.path.dirname(__file__))
        self.cost_forecast_loc_fit_ = os.path.join(
            this_directory, 'bps_train.joblib')
        self.cost_forecast_loc_pred_ = os.path.join(
            this_directory, 'bps_prediction.joblib')

        self.model = SUOD(base_estimators=self.base_estimators, n_jobs=2,
                          rp_flag_global=True, bps_flag=True,
                          contamination=self.contamination,
                          approx_flag_global=True,
                          cost_forecast_loc_fit=self.cost_forecast_loc_fit_,
                          cost_forecast_loc_pred=self.cost_forecast_loc_pred_)

    def test_initialization(self):
        self.model.get_params()
        self.model.set_params(**{'n_jobs': 4})

    def test_fit(self):
        """Test base class fit."""
        self.model.fit(self.X_train)

    def test_approximate(self):
        self.model.fit(self.X_train)
        self.model.approximate(self.X_train)

    def test_predict(self):
        self.model.fit(self.X_train)
        self.model.approximate(self.X_train)
        self.model.predict(self.X_test)

    def test_decision_function(self):
        self.model.fit(self.X_train)
        self.model.approximate(self.X_train)
        self.model.decision_function(self.X_test)
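# A conventional entry point (not shown in the snippet) for running the
# test case above directly with the standard-library runner:
if __name__ == '__main__':
    unittest.main()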
class SUOD(BaseDetector):
    # noinspection PyPep8
    """SUOD (Scalable Unsupervised Outlier Detection) is an acceleration
    framework for large-scale unsupervised outlier detector training and
    prediction. See :cite:`zhao2021suod` for details.

    Parameters
    ----------
    base_estimators : list, length must be greater than 1
        A list of base estimators. Certain methods must be present, e.g.,
        `fit` and `predict`.

    combination : str, optional (default='average')
        Decide how to aggregate the results from multiple models:

        - "average" : average the results from all base detectors
        - "maximization" : output the max value across all base detectors

    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e. the proportion
        of outliers in the data set. Used when fitting to define the
        threshold on the decision function.

    n_jobs : optional (default=1)
        The number of jobs to run in parallel for both `fit` and
        `predict`. If -1, then the number of jobs is set to the number
        of jobs that can actually run in parallel.

    rp_clf_list : list, optional (default=None)
        The list of outlier detection models to use random projection.
        The detector name should be consistent with PyOD.

    rp_ng_clf_list : list, optional (default=None)
        The list of outlier detection models NOT to use random
        projection. The detector name should be consistent with PyOD.

    rp_flag_global : bool, optional (default=True)
        If set to False, random projection is turned off for all base
        models.

    target_dim_frac : float in (0., 1), optional (default=0.5)
        The target compression ratio.

    jl_method : string, optional (default='basic')
        The JL projection method:

        - "basic": each component of the transformation matrix is taken
          at random in N(0,1).
        - "discrete": each component of the transformation matrix is
          taken at random in {-1,1}.
        - "circulant": the first row of the transformation matrix is
          taken at random in N(0,1), and each row is obtained from the
          previous one by a one-left shift.
        - "toeplitz": the first row and column of the transformation
          matrix are taken at random in N(0,1), and each diagonal has a
          constant value taken from these first vectors.

    bps_flag : bool, optional (default=True)
        If set to False, balanced parallel scheduling is turned off.

    approx_clf_list : list, optional (default=None)
        The list of outlier detection models to use pseudo-supervised
        approximation. The detector name should be consistent with PyOD.

    approx_ng_clf_list : list, optional (default=None)
        The list of outlier detection models NOT to use pseudo-supervised
        approximation. The detector name should be consistent with PyOD.

    approx_flag_global : bool, optional (default=True)
        If set to False, pseudo-supervised approximation is turned off.

    approx_clf : object, optional (default: sklearn RandomForestRegressor)
        The supervised model used to approximate unsupervised models.

    cost_forecast_loc_fit : str, optional
        The location of the pretrained cost prediction forecast for
        training.

    cost_forecast_loc_pred : str, optional
        The location of the pretrained cost prediction forecast for
        prediction.

    verbose : int, optional (default=0)
        Controls the verbosity of the building process.

    Attributes
    ----------
    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data. The higher, the more
        abnormal. Outliers tend to have higher scores. This value is
        available once the detector is fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``.
        The threshold is calculated for generating binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers and
        1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """

    def __init__(self, base_estimators=None, contamination=0.1,
                 combination='average', n_jobs=None, rp_clf_list=None,
                 rp_ng_clf_list=None, rp_flag_global=True,
                 target_dim_frac=0.5, jl_method='basic', bps_flag=True,
                 approx_clf_list=None, approx_ng_clf_list=None,
                 approx_flag_global=True, approx_clf=None,
                 cost_forecast_loc_fit=None, cost_forecast_loc_pred=None,
                 verbose=False):
        super(SUOD, self).__init__(contamination=contamination)
        self.base_estimators = base_estimators
        self.contamination = contamination
        self.combination = combination
        self.n_jobs = n_jobs
        self.rp_clf_list = rp_clf_list
        self.rp_ng_clf_list = rp_ng_clf_list
        self.rp_flag_global = rp_flag_global
        self.target_dim_frac = target_dim_frac
        self.jl_method = jl_method
        self.bps_flag = bps_flag
        self.approx_clf_list = approx_clf_list
        self.approx_ng_clf_list = approx_ng_clf_list
        self.approx_flag_global = approx_flag_global
        self.approx_clf = approx_clf
        self.cost_forecast_loc_fit = cost_forecast_loc_fit
        self.cost_forecast_loc_pred = cost_forecast_loc_pred
        self.verbose = verbose

        # by default we will provide a group of well-performing models
        if self.base_estimators is None:
            self.base_estimators = [
                LOF(n_neighbors=15),
                LOF(n_neighbors=20),
                HBOS(n_bins=10),
                HBOS(n_bins=20),
                COPOD(),
                IForest(n_estimators=50),
                IForest(n_estimators=100),
                IForest(n_estimators=150)
            ]
        self.n_estimators = len(self.base_estimators)

        # pass in the arguments for the underlying SUOD model
        self.model_ = SUOD_model(
            base_estimators=self.base_estimators,
            contamination=self.contamination,
            n_jobs=self.n_jobs,
            rp_clf_list=self.rp_clf_list,
            rp_ng_clf_list=self.rp_ng_clf_list,
            rp_flag_global=self.rp_flag_global,
            target_dim_frac=self.target_dim_frac,
            jl_method=self.jl_method,
            approx_clf_list=self.approx_clf_list,
            approx_ng_clf_list=self.approx_ng_clf_list,
            approx_flag_global=self.approx_flag_global,
            approx_clf=self.approx_clf,
            bps_flag=self.bps_flag,
            cost_forecast_loc_fit=self.cost_forecast_loc_fit,
            cost_forecast_loc_pred=self.cost_forecast_loc_pred,
            verbose=self.verbose,
        )

    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # validate inputs X and y (optional)
        X = check_array(X)
        n_samples, n_features = X.shape[0], X.shape[1]
        self._set_n_classes(y)

        # fit the model and then approximate it
        self.model_.fit(X)
        self.model_.approximate(X)

        # get the decision scores from each base estimator
        decision_score_mat = np.zeros([n_samples, self.n_estimators])
        for i in range(self.n_estimators):
            decision_score_mat[:, i] = self.model_.base_estimators[
                i].decision_scores_

        # the scores must be standardized before combination
        decision_score_mat, self.score_scalar_ = standardizer(
            decision_score_mat, keep_scalar=True)

        # todo: may support other combination methods
        if self.combination == 'average':
            decision_score = average(decision_score_mat)
        else:
            decision_score = maximization(decision_score_mat)

        assert (len(decision_score) == n_samples)

        self.decision_scores_ = decision_score.ravel()
        self._process_decision_scores()
        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detectors.
        The anomaly score of an input sample is computed based on
        different detector algorithms. For consistency, outliers are
        assigned with larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples. Sparse matrices are accepted only if they
            are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(
            self, ['model_', 'decision_scores_', 'threshold_', 'labels_'])
        X = check_array(X)

        # get the per-detector scores from the underlying SUOD model
        predicted_scores = self.model_.decision_function(X)

        # standardize the scores and combine
        predicted_scores = self.score_scalar_.transform(predicted_scores)

        # todo: may support other combination methods
        if self.combination == 'average':
            decision_score = average(predicted_scores)
        else:
            decision_score = maximization(predicted_scores)

        assert (len(decision_score) == X.shape[0])
        return decision_score.ravel()
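# A minimal usage sketch for the wrapper class above, assuming PyOD's
# `generate_data` utility (with the return order used elsewhere in this
# document) and the default base estimators; variable names are
# illustrative, not from the source:
from pyod.utils.data import generate_data

X_train, y_train, X_test, y_test = generate_data(
    n_train=200, n_test=100, contamination=0.1, random_state=42)

detector = SUOD(combination='average')
detector.fit(X_train)                             # fit and approximate
train_scores = detector.decision_scores_          # scores on X_train
test_scores = detector.decision_function(X_test)  # scores on new data
test_labels = detector.predict(X_test)            # binary 0/1 labels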
class TestModelSaveLoad(unittest.TestCase):
    def setUp(self):
        self.n_train = 1000
        self.n_test = 500
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.random_state = 42
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=self.random_state)

        self.base_estimators = [
            LOF(n_neighbors=5, contamination=self.contamination),
            LOF(n_neighbors=15, contamination=self.contamination),
            LOF(n_neighbors=25, contamination=self.contamination),
            LOF(n_neighbors=35, contamination=self.contamination),
            LOF(n_neighbors=45, contamination=self.contamination),
            HBOS(contamination=self.contamination),
            PCA(contamination=self.contamination),
            LSCP(detector_list=[
                LOF(n_neighbors=5, contamination=self.contamination),
                LOF(n_neighbors=15, contamination=self.contamination)],
                random_state=self.random_state)
        ]

        this_directory = os.path.abspath(os.path.dirname(__file__))
        self.cost_forecast_loc_fit_ = os.path.join(
            this_directory, 'bps_train.joblib')
        self.cost_forecast_loc_pred_ = os.path.join(
            this_directory, 'bps_prediction.joblib')

        self.model = SUOD(base_estimators=self.base_estimators, n_jobs=2,
                          rp_flag_global=True, bps_flag=True,
                          contamination=self.contamination,
                          approx_flag_global=True,
                          cost_forecast_loc_fit=self.cost_forecast_loc_fit_,
                          cost_forecast_loc_pred=self.cost_forecast_loc_pred_,
                          verbose=True)

    def test_save(self):
        self.model.fit(self.X_train)  # fit all models with X
        self.model.approximate(
            self.X_train)  # conduct model approximation if it is enabled

        # save the model
        dump(self.model, 'model.joblib')
        assert (os.path.exists('model.joblib'))
        os.remove('model.joblib')

    def test_load(self):
        self.model.fit(self.X_train)  # fit all models with X
        self.model.approximate(
            self.X_train)  # conduct model approximation if it is enabled

        # save the model, then load it back
        dump(self.model, 'model.joblib')
        model = load('model.joblib')

        predicted_labels = model.predict(self.X_test)  # predict labels
        predicted_scores = model.decision_function(self.X_test)  # predict scores
        predicted_probs = model.predict_proba(self.X_test)  # predict probabilities

        assert (len(predicted_labels) != 0)
        # assert (predicted_scores)
        # assert (predicted_probs)

    def tearDown(self):
        if os.path.exists('model.joblib'):
            os.remove('model.joblib')
base_estimators = [
    # (earlier entries in this list are omitted in the original snippet)
    HBOS(contamination=contamination),
    PCA(contamination=contamination),
    OCSVM(contamination=contamination),
    KNN(n_neighbors=5, contamination=contamination),
    KNN(n_neighbors=15, contamination=contamination),
    KNN(n_neighbors=25, contamination=contamination),
    KNN(n_neighbors=35, contamination=contamination),
    KNN(n_neighbors=45, contamination=contamination),
    IForest(n_estimators=50, contamination=contamination),
    IForest(n_estimators=100, contamination=contamination),
    LSCP(detector_list=[LOF(contamination=contamination),
                        LOF(contamination=contamination)])
]

model = SUOD(base_estimators=base_estimators, n_jobs=6, bps_flag=True,
             contamination=contamination, approx_flag_global=False)

model.fit(X)  # fit all models with X
model.approximate(X)  # conduct model approximation if it is enabled
predicted_labels = model.predict(X)  # predict labels on X; for demo purpose only
predicted_scores = model.decision_function(X)  # predict scores on X; for demo purpose only

# %%
evaluate_print('majority vote', y, majority_vote(predicted_labels))
evaluate_print('average', y, average(predicted_scores))
evaluate_print('maximization', y, maximization(predicted_scores))

clf = LOF()
clf.fit(X)
evaluate_print('LOF', y, clf.decision_scores_)
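# To make the combination step above concrete, a tiny demo of PyOD's
# combination functions on a hand-made (n_samples, n_estimators) matrix;
# the values are illustrative only:
import numpy as np
from pyod.models.combination import average, maximization, majority_vote

scores = np.array([[0.1, 0.3, 0.2],
                   [0.9, 0.8, 0.7]])
print(average(scores))        # per-sample mean across detectors: [0.2, 0.8]
print(maximization(scores))   # per-sample max across detectors:  [0.3, 0.9]

labels = np.array([[0, 0, 1],
                   [1, 1, 0]])
print(majority_vote(labels))  # per-sample majority vote:         [0, 1]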