contamination = 0.1 # percentage of outliers n_train = 200 # number of training points n_test = 100 # number of testing points # Generate sample data X_train, X_test, y_train, y_test = \ generate_data(n_train=n_train, n_test=n_test, n_features=2, contamination=contamination, random_state=42) # train LOCI detector clf_name = 'LODA' clf = LODA() clf.fit(X_train) # get the prediction labels and outlier scores of the training data y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) y_train_scores = clf.decision_scores_ # raw outlier scores # get the prediction on the test data y_test_pred = clf.predict(X_test) # outlier labels (0 or 1) y_test_scores = clf.decision_function(X_test) # outlier scores # evaluate and print the results print("\nOn Training Data:") evaluate_print(clf_name, y_train, y_train_scores) print("\nOn Test Data:") evaluate_print(clf_name, y_test, y_test_scores)
class TestLODA(unittest.TestCase):
    """Unit tests for the LODA outlier detector."""

    def setUp(self):
        """Fit a LODA model on synthetic data with known contamination."""
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        (self.X_train, self.y_train,
         self.X_test, self.y_test) = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = LODA(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        """Every fitted attribute must exist and be populated."""
        for attr in ('decision_scores_', 'labels_', 'threshold_',
                     '_mu', '_sigma', 'projections_'):
            assert hasattr(self.clf, attr)
            assert getattr(self.clf, attr) is not None

    def test_train_scores(self):
        """One training score per training sample."""
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        """Test scores have the right shape and clear the ROC floor."""
        scores = self.clf.decision_function(self.X_test)
        assert_equal(scores.shape[0], self.X_test.shape[0])
        assert roc_auc_score(self.y_test, scores) >= self.roc_floor

    def test_prediction_labels(self):
        """Predicted labels align with the ground-truth label shape."""
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        """Default probability output stays within [0, 1]."""
        proba = self.clf.predict_proba(self.X_test)
        assert proba.min() >= 0
        assert proba.max() <= 1

    def test_prediction_proba_linear(self):
        """Linear-scaled probability output stays within [0, 1]."""
        proba = self.clf.predict_proba(self.X_test, method='linear')
        assert proba.min() >= 0
        assert proba.max() <= 1

    def test_prediction_proba_unify(self):
        """Unified probability output stays within [0, 1]."""
        proba = self.clf.predict_proba(self.X_test, method='unify')
        assert proba.min() >= 0
        assert proba.max() <= 1

    def test_prediction_proba_parameter(self):
        """An unknown conversion method must raise ValueError."""
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        """fit_predict returns one label per training sample."""
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        """All supported scoring names work; unknown ones raise."""
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_model_clone(self):
        """The estimator must be clonable via sklearn's clone()."""
        clone_clf = clone(self.clf)

    def tearDown(self):
        pass
iterate_threshold = True

if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s P%(process)d %(levelname)s %(message)s",
    )

    # load dataset splits and ground-truth test labels
    data_dict = load_dataset(dataset, subdataset, "all")
    x_train = data_dict["train"]
    x_test = data_dict["test"]
    x_test_labels = data_dict["test_labels"]

    # fit the LODA detector on the training split
    od = LODA(n_bins=n_bins, n_random_cuts=n_random_cuts)
    od.fit(x_train)

    # score the test split and evaluate against the ground truth
    anomaly_score = od.decision_function(x_test)
    anomaly_label = x_test_labels
    evaluate_all(anomaly_score, anomaly_label)
random_state=42) # load pretrained models prepare_trained_model() # recommended models selected_models = select_model(X_train, n_selection=100) print("Showing the top recommended models...") for i, model in enumerate(selected_models): print(i, model) print() model_1 = LODA(n_bins=5, n_random_cuts=100) print( "1st model Average Precision", average_precision_score(y_train, model_1.fit(X_train).decision_scores_)) model_10 = LODA(n_bins=5, n_random_cuts=20) print( "10th model Average Precision", average_precision_score(y_train, model_10.fit(X_train).decision_scores_)) model_50 = OCSVM(kernel='sigmoid', nu=0.6) print( "50th model Average Precision", average_precision_score(y_train, model_50.fit(X_train).decision_scores_))
# grogger: binarize arrests and build the regression design matrix
df['arr86'] = (df['narr86'] >= 1).astype(int)
Y = df['arr86']
X = df[[
    'pcnv', 'avgsen', 'tottime', 'ptime86', 'inc86', 'black', 'hispan',
    'born60'
]]
print(i, X.shape, Y.shape)

if OD_Flag:
    # clf = HBOS(contamination=0.05)
    # clf = IForest(contamination=0.05)
    clf = LODA(contamination=0.05)
    clf.fit(X)
    # Remove detected outliers. clf.labels_ is positional (one entry per
    # row of X), so select with a boolean mask; the previous
    # X.loc[np.where(clf.labels_ == 0)] fed positional indices to the
    # label-based .loc, which picks the wrong rows (or raises) whenever
    # df's index is not the default RangeIndex.
    inlier_mask = clf.labels_ == 0
    X = X.loc[inlier_mask]
    Y = Y.loc[inlier_mask]

X = sm.add_constant(X)

# general OLS
# https://www.statsmodels.org/stable/generated/statsmodels.regression.linear_model.OLS.html
# model=sm.OLS(Y, X.astype(float))

# robust regression
# https://www.statsmodels.org/stable/generated/statsmodels.robust.robust_linear_model.RLM.html
# model=sm.RLM(Y, X.astype(float))
sklearn_score_anomalies = clf.decision_function(X_test)
# flip sign and shift so that larger = more anomalous, matching the
# scoring convention expected by evaluate.AUC
original_paper_score = [-1 * s + 0.5 for s in sklearn_score_anomalies]
auc_svm_ws = evaluate.AUC(original_paper_score, y_test)

# --- LOF --- #
lof = LocalOutlierFactor(novelty=True)
lof.fit(X_train)
sklearn_score_anomalies = lof.decision_function(X_test)
original_paper_score = [-1 * s + 0.5 for s in sklearn_score_anomalies]
auc_lof_ws = evaluate.AUC(original_paper_score, y_test)

# --- LODA --- #
# repeat the randomized LODA experiment and average the AUCs
aucs_loda_ws = np.zeros(num_of_experiments)
for r in tqdm(range(num_of_experiments)):
    loda = LODA()
    loda.fit(X_train)
    y_pred_proba_loda = np.zeros(X_test.shape[0])
    for i in tqdm(range(X_test.shape[0])):
        # NOTE(review): this re-fits LODA on every single test sample,
        # discarding the model fitted on X_train above -- confirm the
        # per-sample refit is the intended (streaming) protocol and not
        # a bug
        loda.fit(X_test[i, :].reshape(1, -1))
        y_pred_proba_loda[i] = loda.decision_function(X_test[i, :].reshape(
            1, -1))
    # scores are inverted (1 - p) before AUC to match label polarity
    aucs_loda_ws[r] = evaluate.AUC(1 - y_pred_proba_loda, y_test)
auc_loda_ws = np.mean(aucs_loda_ws)

# --- HalfSpaceTrees --- #
aucs_hst_ws = np.zeros(num_of_experiments)
for r in tqdm(range(num_of_experiments)):
    hst = HalfSpaceTrees(n_features=X_train_hst.shape[1], n_estimators=100)
    # the fit API requires labels; all-zeros marks every training
    # sample as normal
    hst.fit(X_train_hst, np.zeros(X_train_hst.shape[0]))
    y_pred_proba_hst = np.zeros(X_test_hst.shape[0])
    for i in tqdm(range(X_test_hst.shape[0])):