def main():
    """Launch the "30_Models" benchmark, one parallel job per scaler.

    Builds the dictionary of detector instances, then dispatches
    ``runByScaler`` once for each of the scaler names ``'no'``, ``'std'``
    and ``'minmax'`` through ``Parallel``/``delayed``. Nothing is returned;
    whatever ``runByScaler`` yields is discarded.
    """
    dataset_root = 'Unsupervised_Anamaly_Detection_csv'
    run_label = "30_Models"
    scaler_names = ['no', 'std', 'minmax']
    start_at = 0
    how_many = 90
    scaler_workers = 3  # n_jobs of the outer Parallel (one job per scaler)
    model_workers = 4  # forwarded to runByScaler as CPUS

    # Names forwarded to runByScaler as ``other_models``.
    # NOTE(review): 'IForest' is listed here but ``detectors`` below has no
    # such key (IsolationForest is registered as 'IF') -- confirm how
    # runByScaler treats a name without a matching model.
    other_model_names = [
        'AvgKNN', 'LargestKNN', 'MedKNN', 'PCA', 'COF', 'LODA', 'LOF',
        'HBOS', 'MCD', 'AvgBagging', 'MaxBagging', 'IForest', 'CBLOF',
        'COPOD', 'SOD', 'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD',
        'SoGaal', 'MoGaal', 'VAE', 'AutoEncoder',
    ]

    # Insertion order is kept exactly as before.
    detectors = {
        'BRM': BRM(bootstrap_sample_percent=70),
        'GM': GaussianMixture(),
        'IF': IsolationForest(),
        'OCSVM': OneClassSVM(),
        'EE': EllipticEnvelope(),
        'AvgKNN': KNN(method='mean'),
        'LargestKNN': KNN(method='largest'),
        'MedKNN': KNN(method='median'),
        'PCA': PCA(),
        'COF': COF(),
        'LODA': LODA(),
        'LOF': LOF(),
        'HBOS': HBOS(),
        'MCD': MCD(),
        'AvgBagging': FeatureBagging(combination='average'),
        'MaxBagging': FeatureBagging(combination='max'),
        'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'FactorAnalysis': FactorAnalysis(),
        'KernelDensity': KernelDensity(),
        'COPOD': COPOD(),
        'SOD': SOD(),
        'LSCPwithLODA': LSCP([LODA(), LODA()]),
        'AveLMDD': LMDD(dis_measure='aad'),
        'VarLMDD': LMDD(dis_measure='var'),
        'IqrLMDD': LMDD(dis_measure='iqr'),
        'SoGaal': SO_GAAL(),
        'MoGaal': MO_GAAL(),
        'VAE': VAE(encoder_neurons=[8, 4, 2]),
        'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'OCKRA': m_OCKRA(),
    }

    # One lazily-built task per scaler; Parallel consumes the generator.
    tasks = (
        delayed(runByScaler)(
            dataset_root,
            scaler_name,
            detectors,
            start_at,
            how_many,
            other_models=other_model_names,
            CPUS=model_workers,
            save_name=run_label,
        )
        for scaler_name in scaler_names
    )
    Parallel(n_jobs=scaler_workers)(tasks)
def setUp(self):
    """Create the synthetic train/test split and fit an LMDD detector on it."""
    # Fixture sizes and thresholds read by the individual tests.
    self.n_train, self.n_test = 100, 50
    self.contamination = 0.1
    self.roc_floor = 0.6

    # NOTE(review): the unpacking order (X_train, y_train, X_test, y_test)
    # must match what the installed generate_data returns -- confirm.
    (self.X_train, self.y_train,
     self.X_test, self.y_test) = generate_data(
        n_train=self.n_train,
        n_test=self.n_test,
        contamination=self.contamination,
        random_state=42,
    )

    # The detector is stored on the instance before fitting.
    self.clf = LMDD(contamination=self.contamination, random_state=42)
    self.clf.fit(self.X_train)
def test_check_parameters(self):
    """Constructing LMDD with invalid arguments must raise the listed error."""
    # (expected exception, constructor keyword arguments), checked in order.
    invalid_constructions = (
        (ValueError, {'contamination': 10.}),
        (ValueError, {'dis_measure': 'unknown'}),
        (TypeError, {'dis_measure': 5}),
        (TypeError, {'n_iter': 'not int'}),
        (ValueError, {'n_iter': -1}),
        (ValueError, {'random_state': 'not valid'}),
        (ValueError, {'random_state': -1}),
    )
    for expected_error, bad_kwargs in invalid_constructions:
        with assert_raises(expected_error):
            LMDD(**bad_kwargs)
class TestCOF(unittest.TestCase):
    """Unit tests exercising the LMDD detector.

    NOTE(review): the class is called ``TestCOF`` although every test here
    builds and checks ``LMDD``; the name is left untouched so existing test
    ids keep working.
    """

    def setUp(self):
        """Create the synthetic train/test split and fit an LMDD on it."""
        self.n_train, self.n_test = 100, 50
        self.contamination = 0.1
        self.roc_floor = 0.6

        (self.X_train, self.y_train,
         self.X_test, self.y_test) = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42,
        )

        self.clf = LMDD(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    @staticmethod
    def _assert_within_unit_interval(values):
        """Assert that the smallest value is >= 0 and the largest is <= 1."""
        assert (values.min() >= 0)
        assert (values.max() <= 1)

    def test_sklearn_estimator(self):
        """Placeholder: the check_estimator call is currently disabled."""
        # check_estimator(self.clf)
        pass

    def test_parameters(self):
        """A fitted detector exposes each expected attribute, none of them None."""
        fitted_attributes = (
            'decision_scores_',
            'labels_',
            'threshold_',
            'dis_measure_',
            'n_iter_',
            'random_state_',
        )
        for attribute in fitted_attributes:
            assert (hasattr(self.clf, attribute) and
                    getattr(self.clf, attribute) is not None)

    def test_train_scores(self):
        """There is one training score per training sample."""
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        """Test scores have one entry per sample and reach the ROC floor."""
        scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(scores.shape[0], self.X_test.shape[0])

        # check performance
        assert (roc_auc_score(self.y_test, scores) >= self.roc_floor)

    def test_prediction_labels(self):
        """Predicted labels have the same shape as the ground truth."""
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        """Default probabilities lie in [0, 1]."""
        self._assert_within_unit_interval(self.clf.predict_proba(self.X_test))

    def test_prediction_proba_linear(self):
        """Probabilities from method='linear' lie in [0, 1]."""
        proba = self.clf.predict_proba(self.X_test, method='linear')
        self._assert_within_unit_interval(proba)

    def test_prediction_proba_unify(self):
        """Probabilities from method='unify' lie in [0, 1]."""
        proba = self.clf.predict_proba(self.X_test, method='unify')
        self._assert_within_unit_interval(proba)

    def test_prediction_proba_parameter(self):
        """An unknown probability conversion method raises ValueError."""
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_prediction_labels_confidence(self):
        """predict(return_confidence=True) gives labels plus [0, 1] confidence."""
        labels, confidence = self.clf.predict(self.X_test,
                                              return_confidence=True)
        assert_equal(labels.shape, self.y_test.shape)
        assert_equal(confidence.shape, self.y_test.shape)
        self._assert_within_unit_interval(confidence)

    def test_prediction_proba_linear_confidence(self):
        """predict_proba(return_confidence=True) keeps both outputs in [0, 1]."""
        proba, confidence = self.clf.predict_proba(self.X_test,
                                                   method='linear',
                                                   return_confidence=True)
        self._assert_within_unit_interval(proba)

        assert_equal(confidence.shape, self.y_test.shape)
        self._assert_within_unit_interval(confidence)

    def test_fit_predict(self):
        """fit_predict returns one label per training sample."""
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        """fit_predict_score runs for known scorers and rejects unknown ones."""
        self.clf.fit_predict_score(self.X_test, self.y_test)
        for scorer in ('roc_auc_score', 'prc_n_score'):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring=scorer)
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_check_parameters(self):
        """Constructing LMDD with invalid arguments raises the listed error."""
        invalid_constructions = (
            (ValueError, {'contamination': 10.}),
            (ValueError, {'dis_measure': 'unknown'}),
            (TypeError, {'dis_measure': 5}),
            (TypeError, {'n_iter': 'not int'}),
            (ValueError, {'n_iter': -1}),
            (ValueError, {'random_state': 'not valid'}),
            (ValueError, {'random_state': -1}),
        )
        for expected_error, bad_kwargs in invalid_constructions:
            with assert_raises(expected_error):
                LMDD(**bad_kwargs)

    def test_model_clone(self):
        """Cloning the fitted detector must not raise."""
        clone(self.clf)

    def tearDown(self):
        """Nothing to clean up."""
        pass
def main():
    """Run the full ranking pipeline: predict, score, then plot.

    The pipeline has three stages, all driven from here:

    1. Build every detector and let ``ADRanker`` collect predictions for
       the data found under ``"datasets"``.
    2. Score those predictions with the ``auc`` and ``ave`` metrics.
    3. Draw the basic plots and the CD diagrams from the score CSV files
       under ``results/scores/<metric>/<scaler>/results.csv``.

    Returns nothing.
    """
    # PART 1:
    # Getting the predictions for each classifier
    # SK means: The classifier is from sklearn or works like sklearn
    # PY means: The classifier is from pyod or works like pyod
    models = {
        'SK_EE': EllipticEnvelope(),
        'SK_GM': GaussianMixture(),
        'SK_IF': IsolationForest(),
        'SK_OCSVM': OneClassSVM(),
        'SK_FA': FactorAnalysis(),
        'SK_KD': KernelDensity(),
        'PY_PCA': PCA(),
        'PY_COF': COF(),
        'PY_LODA': LODA(),
        'PY_LOF': LOF(),
        'PY_HBOS': HBOS(),
        'PY_MCD': MCD(),
        'PY_AvgKNN': KNN(method='mean'),
        'PY_LargestKNN': KNN(method='largest'),
        'PY_MedKNN': KNN(method='median'),
        'PY_AvgBagging': FeatureBagging(combination='average'),
        'PY_MaxBagging': FeatureBagging(combination='max'),
        'PY_CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'PY_COPOD': COPOD(),
        'PY_SOD': SOD(),
        'PY_LSCPwithLODA': LSCP([LODA(), LODA()]),
        'PY_AveLMDD': LMDD(dis_measure='aad'),
        'PY_VarLMDD': LMDD(dis_measure='var'),
        'PY_IqrLMDD': LMDD(dis_measure='iqr'),
        'PY_VAE': VAE(encoder_neurons=[8, 4, 2]),
        'PY_AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'SK_BRM': BRM(bootstrap_sample_percent=70),
        'SK_OCKRA': m_OCKRA(),
        'PY_SoGaal': SO_GAAL(),
        'PY_MoGaal': MO_GAAL(),
    }

    ranker = ADRanker(data="datasets", models=models)
    ranker.get_predictions()

    # PART 2:
    # After predictions, we can evaluate our classifiers using different scores
    # You can add manually a new metric by modifying 'metrics.py'
    ranker.get_scores(scores={'auc': Metrics.get_roc,
                              'ave': Metrics.get_ave})

    # PART 3:
    # Finally, it is time to summarize the results by plotting different graphs
    # You can add your own graphs by modifying 'plots.py'
    plot = Plots()
    plot.make_plot_basic(
        paths=[
            'results/scores/auc/no/results.csv',
            'results/scores/auc/minmax/results.csv',
            'results/scores/auc/std/results.csv',
            'results/scores/ave/no/results.csv',
            'results/scores/ave/minmax/results.csv',
            'results/scores/ave/std/results.csv',
        ],
        scalers=[
            'Without scaler',
            'Min max scaler',
            'Standard scaler',
            'Without scaler',
            'Min max scaler',
            'Standard scaler',
        ])

    # paths, names and titles are parallel lists: entry i of each describes
    # the same diagram. The second path previously pointed at the unscaled
    # 'ave/no' file although its name/title are for min-max scaling, so the
    # 'ave/no' results were drawn twice and 'ave/minmax' never.
    plot.make_cd_plot(
        paths=[
            'results/scores/auc/minmax/results.csv',
            'results/scores/ave/minmax/results.csv',
            'results/scores/auc/no/results.csv',
            'results/scores/ave/no/results.csv',
            'results/scores/auc/std/results.csv',
            'results/scores/ave/std/results.csv',
        ],
        names=[
            'CD auc minmax scale',
            'CD ave minmax scale',
            'CD auc no scale',
            'CD ave no scale',
            'CD auc std scale',
            'CD ave std scale',
        ],
        titles=[
            'CD diagram - AUC with min max scaling',
            'CD diagram - Average precision with min max scaling',
            'CD diagram - AUC without scaling',
            'CD diagram - Average precision without scaling',
            'CD diagram - AUC with standard scaling',
            'CD diagram - Average precision with standard scaling',
        ])
lr_d=0.01, lr_g=0.0001, decay=1e-06, momentum=0.9, contamination=0.1), 'MO_GAAL'), # SO_GAAL pyod (SO_GAAL(stop_epochs=20, lr_d=0.01, lr_g=0.0001, decay=1e-06, momentum=0.9, contamination=0.1), 'SO_GAAL'), # OCKRA github (m_ockra.m_OCKRA(), 'OCKRA'), # VAR LMDD pyOD (LMDD(dis_measure='var', random_state=rs), 'VAR_LMDD'), # LOCI pyod (LSCP(detector_list, local_region_size=30, local_max_features=1.0, n_bins=10, random_state=None, contamination=0.1), 'LSCP') ] # Select the model location with i to run i = 8 had_error = [] # Initialize the class anomaly #for i in range(1,8): # try:
if __name__ == "__main__":
    # --- experiment settings -------------------------------------------
    n_train = 200  # number of training points
    n_test = 100  # number of testing points
    contamination = 0.1  # percentage of outliers

    # --- synthetic two-feature sample ----------------------------------
    # NOTE(review): the unpacking order below must match what the installed
    # generate_data returns -- confirm against the pyod version in use.
    X_train, y_train, X_test, y_test = generate_data(
        n_train=n_train,
        n_test=n_test,
        n_features=2,
        contamination=contamination,
        random_state=42,
    )

    # --- fit the LMDD detector -----------------------------------------
    clf_name = 'LMDD'
    clf = LMDD(random_state=42)
    clf.fit(X_train)

    # Results on the training data, read from the fitted detector.
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # Results on the held-out test data.
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # --- report ---------------------------------------------------------
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
def initialise_pyod_classifiers(self, outlier_fraction):
    """Create one unfitted PyOD detector per class for every algorithm.

    Every query is tested against every class and a prediction is made only
    if it belongs to the same class, so each algorithm needs exactly
    ``self.k_way`` independent detector instances (one per class).

    The previous implementation wrapped the construction in two nested
    ``for i in range(self.k_way)`` loops, which produced ``k_way ** 2``
    detectors per algorithm; a single pass per algorithm is used now.

    Parameters
    ----------
    outlier_fraction : float
        Passed to every detector as its ``contamination`` argument.

    Returns
    -------
    dict
        Maps the display name of each algorithm to a list holding
        ``self.k_way`` detector instances.

    Side effects
    ------------
    Sets ``self.num_different_models`` to the number of algorithms.
    """
    # Neighbourhood sizes are derived from the number of shots.
    n_neighbors = int(self.n_shot / 3) + 1
    sod_neighbors = int(self.n_shot / 3) + 2
    sod_ref_set = max(2, int(sod_neighbors / 3))

    # Zero-argument factories: each call returns a brand-new detector, so
    # the per-class instances never share state. Insertion order defines
    # the key order of the returned dictionary.
    factories = {
        # Proximity based
        'K Nearest Neighbors (KNN)': lambda: KNN(
            method='largest', n_neighbors=n_neighbors,
            contamination=outlier_fraction),
        'Average K Nearest Neighbors (AvgKNN)': lambda: KNN(
            method='mean', n_neighbors=n_neighbors,
            contamination=outlier_fraction),
        'Median K Nearest Neighbors (MedKNN)': lambda: KNN(
            method='median', n_neighbors=n_neighbors,
            contamination=outlier_fraction),
        'Local Outlier Factor (LOF)': lambda: LOF(
            n_neighbors=n_neighbors, contamination=outlier_fraction),
        'Connectivity-Based Outlier Factor (COF)': lambda: COF(
            n_neighbors=n_neighbors, contamination=outlier_fraction),
        # 'Clustering-Based Local Outlier Factor (CBLOF)' is disabled.
        'LOCI': lambda: LOCI(contamination=outlier_fraction),
        # 'Histogram-based Outlier Score (HBOS)' is disabled.
        'Subspace Outlier Detection (SOD)': lambda: SOD(
            n_neighbors=sod_neighbors, contamination=outlier_fraction,
            ref_set=sod_ref_set),
        # Linear models
        'Principal Component Analysis (PCA)': lambda: PCA(
            contamination=outlier_fraction),
        # 'Minimum Covariance Determinant (MCD)' is disabled: too slow.
        'One-Class Support Vector Machines (OCSVM)': lambda: OCSVM(
            contamination=outlier_fraction),
        'Deviation-based Outlier Detection (LMDD)': lambda: LMDD(
            contamination=outlier_fraction),
        # Probabilistic
        'Angle-Based Outlier Detection (ABOD)': lambda: ABOD(
            contamination=outlier_fraction),
        'Stochastic Outlier Selection (SOS)': lambda: SOS(
            contamination=outlier_fraction),
        # Outlier ensembles
        'Isolation Forest (IForest)': lambda: IForest(
            contamination=outlier_fraction),
        'Feature Bagging': lambda: FeatureBagging(
            contamination=outlier_fraction),
        'Lightweight On-line Detector of Anomalies (LODA)': lambda: LODA(
            contamination=outlier_fraction),
    }

    classifiers = {
        name: [build() for _ in range(self.k_way)]
        for name, build in factories.items()
    }

    self.num_different_models = len(classifiers)
    return classifiers
def pyod_init(model, n_features=None):
    """Return a freshly constructed, unfitted detector for ``model``.

    Each PyOD class is imported lazily, only when its name is requested.

    Parameters
    ----------
    model : str
        Short model name such as ``'abod'``, ``'knn'`` or ``'pca'``.
    n_features : int, optional
        Only used by ``'auto_encoder'`` to size its hidden layers.

    Returns
    -------
    The detector instance. ``PyODDefaultModel()`` is returned when the name
    is not recognised, and also for ``'auto_encoder'`` when ``n_features``
    is missing or falsy (no exception is raised in either case).
    """
    if model == 'abod':
        from pyod.models.abod import ABOD
        return ABOD()

    if model == 'auto_encoder' and n_features:
        # (Setting os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' here is
        # currently disabled.)
        from pyod.models.auto_encoder import AutoEncoder
        layer_widths = [
            n_features, n_features * 5, n_features * 5, n_features
        ]
        return AutoEncoder(hidden_neurons=layer_widths,
                           epochs=5,
                           batch_size=64,
                           preprocessing=False)

    if model == 'cblof':
        from pyod.models.cblof import CBLOF
        return CBLOF(n_clusters=4)

    if model == 'hbos':
        from pyod.models.hbos import HBOS
        return HBOS()

    if model == 'iforest':
        from pyod.models.iforest import IForest
        return IForest()

    if model == 'knn':
        from pyod.models.knn import KNN
        return KNN()

    if model == 'lmdd':
        from pyod.models.lmdd import LMDD
        return LMDD()

    if model == 'loci':
        from pyod.models.loci import LOCI
        return LOCI()

    if model == 'loda':
        from pyod.models.loda import LODA
        return LODA()

    if model == 'lof':
        from pyod.models.lof import LOF
        return LOF()

    if model == 'mcd':
        from pyod.models.mcd import MCD
        return MCD()

    if model == 'ocsvm':
        from pyod.models.ocsvm import OCSVM
        return OCSVM()

    if model == 'pca':
        from pyod.models.pca import PCA
        return PCA()

    if model == 'sod':
        from pyod.models.sod import SOD
        return SOD()

    if model == 'vae':
        from pyod.models.vae import VAE
        return VAE()

    if model == 'xgbod':
        from pyod.models.xgbod import XGBOD
        return XGBOD()

    # No branch matched: fall back to the default model rather than
    # raising ValueError(f"unknown model {model}").
    return PyODDefaultModel()
'PCA': PCA(), 'COF': COF(), 'LODA': LODA(), 'LOF': LOF(), 'HBOS': HBOS(), 'MCD': MCD(), 'AvgBagging': FeatureBagging(combination='average'), 'MaxBagging': FeatureBagging(combination='max'), 'IForest': IForest(), 'CBLOF': CBLOF(n_clusters=10, n_jobs=4), 'FactorAnalysis': FactorAnalysis(), 'KernelDensity': KernelDensity(), 'COPOD': COPOD(), 'SOD': SOD(), 'LSCPwithLODA': LSCP([LODA(), LODA()]), 'AveLMDD': LMDD(dis_measure='aad'), 'VarLMDD': LMDD(dis_measure='var'), 'IqrLMDD': LMDD(dis_measure='iqr'), 'SoGaal': SO_GAAL(), #'MoGaal':MO_GAAL(), 'VAE': VAE(encoder_neurons=[8, 4, 2]), 'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]) } models = { 'XGBOD': XGBOD(), 'BRM': BRM(), 'GM': GaussianMixture(), 'IF': IsolationForest(), 'OCSVM': OneClassSVM(), 'EE': EllipticEnvelope(),
def outlier_detection(df, chassis_number='WBA1C11080J829552'):
    """Flag outlying ``Kms`` readings of one vehicle with several detectors.

    The rows of ``df`` whose ``Chassis_Number`` equals ``chassis_number``
    are selected, each detector is fitted on the rounded ``Kms`` column of
    those rows and its ``predict`` output for the same rows is stored in a
    dedicated column. The resulting table is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``Chassis_Number``, ``Kms``,
        ``Movement_Date`` and ``Kms_diff``. It is not modified.
    chassis_number : str, optional
        Vehicle to analyse. Defaults to the chassis that used to be
        hard-coded (``'VF3LCYHZPJS332137'`` was the alternative kept in a
        comment).

    Returns
    -------
    None
        The result is only printed.
    """
    # Work on an independent copy: assigning new columns to a filtered
    # slice of ``df`` triggers pandas' SettingWithCopyWarning and may or
    # may not write through to the caller's frame.
    testing_df = df[df['Chassis_Number'] == chassis_number].copy()

    # All detectors see the same single-feature matrix (one row per
    # reading), rounded to whole units.
    kms = np.round(np.array(testing_df['Kms'].values).reshape(-1, 1), 0)

    # (output column, detector) pairs, evaluated in this order.
    # NOTE(review): the columns are called "score" but hold predict()
    # labels. IsolationForest looks like scikit-learn's estimator, whose
    # labels would be -1/1 rather than 0/1 -- confirm before comparing
    # columns.
    detectors = [
        ('outlier_score_lof',
         LOF(n_neighbors=10, contamination=0.1)),
        ('outlier_score_lmdd',
         LMDD(n_iter=100, contamination=0.1)),
        ('outlier_score_isolation_forest',
         IsolationForest(n_estimators=100, contamination=0.1)),
        ('outlier_score_knn_mean',
         KNN(method='mean', n_neighbors=3, contamination=0.1)),
        ('outlier_score_knn_median',
         KNN(method='median', n_neighbors=3, contamination=0.1)),
    ]
    for column, clf in detectors:
        clf.fit(kms)
        testing_df[column] = clf.predict(kms)

    print(testing_df[['Movement_Date', 'Kms', 'Kms_diff']
                     + [column for column, _ in detectors]])
    return
df.to_csv(self.model_name + '_results.csv', index=False) print('\nFinished ' + self.model_name) return None if __name__ == '__main__': # Specify the root directory rootDir = 'G:/My Drive/Github/ml-group-col/One-Class-models/Anomaly_Datasets_csv/' # specify the random state rs = 10 # Save how to run the models models = [ (IsolationForest(random_state=rs), 'ISOF'), (EllipticEnvelope(random_state=rs), 'EE'), (LMDD(dis_measure='aad', random_state=rs), 'AAD_LMDD'), (COPOD(), 'COPOD'), (FeatureBagging(combination='average', random_state=rs), 'AVE_Bagging'), # n_jobs (LMDD(dis_measure='iqr', random_state=rs), 'IQR_LMDD'), (KNN(method='largest'), 'Largest_KNN'), # n_jobs (LODA(), 'LODA'), (FeatureBagging(combination='max', n_jobs=-1, random_state=rs), 'MAX_Bagging'), (MCD(random_state=rs), 'MCD'), (XGBOD(random_state=rs), 'XGBOD'), # n_jobs (GaussianMixture(random_state=rs), 'GMM'), (LocalOutlierFactor(novelty=True), 'LOF'), (KNN(method='median'), 'Median_KNN'), # n_jobs (KNN(method='mean'), 'Avg_KNN'), # n_jobs (CBLOF(n_clusters=10, random_state=rs), 'CBLOF'),