# NOTE(review): this chunk begins mid-method — the enclosing `def` (which
# supplies `self`, `distances_X` and `neighbors_indices`) is outside the
# visible source. It appears to compute an sklearn-LOF-style local
# reachability density — TODO confirm against the missing signature.
# k-distance of each neighbour (k = self.n_neighbors_).
dist_k = self._distances_fit_X_[neighbors_indices, self.n_neighbors_ - 1]
# Reachability distance: elementwise max of actual distance and k-distance.
reach_dist_array = np.maximum(distances_X, dist_k)
# 1e-10 avoids `nan` when the number of duplicates > n_neighbors_
# (the mean reachability distance would be exactly 0 in that case).
return 1. / (np.mean(reach_dist_array, axis=1) + 1e-10)


# In[75]:

# Unsupervised LOF fit on the two selected predictor columns.
# predictor_keys = ['mean', 'expenses', 'sum', 'distance_traveled']
predictor_keys = ['expenses', 'distance_traveled']
model = LocalOutlierFactor(n_jobs=-1)
y = model.fit_predict(dataset_with_distances[predictor_keys])
# NOTE(review): `_predict` is private sklearn API — kept working only on old
# releases; newer ones require LocalOutlierFactor(novelty=True) + `predict`.
# TODO confirm the pinned scikit-learn version.
model._predict([expected_abnormal_day[predictor_keys]])


# In[76]:

# LOF labels outliers as -1; record a boolean flag and count the anomalies.
dataset_with_distances['lof_anomaly'] = y == -1
dataset_with_distances['lof_anomaly'].sum()


# In[77]:

# Scatter plot of the two predictors, coloured by the LOF anomaly flag.
# NOTE(review): truncated here — the `scatter_kws` dict (and the rest of the
# lmplot call) continues beyond this chunk's visible source.
sns.lmplot('expenses', 'distance_traveled', data=dataset_with_distances, fit_reg=False, hue='lof_anomaly', scatter_kws={ 'marker': 'D',
def AnomalyDetection(filepath):
    """Run four anomaly-detection methods on pre-normalized CSV data.

    Loads train/test feature matrices and binary labels (1 = failed point)
    from ``filepath`` and fits/evaluates:

    1. One-Class SVM     -- trained on healthy rows only; failed training
                            rows are appended to the test set for scoring
    2. Isolation Forest
    3. Elliptic Envelope
    4. Local Outlier Factor -- one fit per neighbour count in a fixed sweep

    Each fitted model is pickled under ``filepath`` and a text summary is
    written to ``filepath + 'outliers_new_results.txt'``. Per-method error
    reporting is delegated to ``anomaly_detection_error`` (defined elsewhere
    in this module).

    Parameters
    ----------
    filepath : str
        Directory prefix (including trailing separator) holding the input
        CSVs and receiving every output file.
    """
    # --- load data; every CSV has a single header row ----------------------
    train_X = np.loadtxt(filepath + 'normalized_train_file.csv', delimiter=',', dtype=float, skiprows=1)
    test_X = np.loadtxt(filepath + 'pseudonormalized_test_file.csv', delimiter=',', dtype=float, skiprows=1)
    train_Y = np.loadtxt(filepath + 'Y_train_file.csv', delimiter=',', dtype=float, skiprows=1)
    test_Y = np.loadtxt(filepath + 'Y_test_file.csv', delimiter=',', dtype=float, skiprows=1)

    input_dimensions = str(train_X.shape[1])      # feature length
    samples_size = str(train_X.shape[0])          # number of rows
    input_dimensions_test = str(test_X.shape[1])  # feature length
    samples_size_test = str(test_X.shape[0])      # number of rows
    num_failed_train = train_Y[train_Y == 1].shape[0]
    num_failed_test = test_Y[test_Y == 1].shape[0]

    with open(filepath + 'outliers_new_results.txt', 'w') as output:
        output.write("===== DATA INFORMATION =====\n")
        output.write('training data size: ' + samples_size + ' by ' + input_dimensions + '\n')
        output.write('test data size: ' + samples_size_test + ' by ' + input_dimensions_test + '\n')
        # BUGFIX: both counts below previously lacked the trailing '\n',
        # so the two report lines ran together on one line.
        output.write('failed points in training: ' + str(num_failed_train) + '\n')
        output.write('failed points in testing: ' + str(num_failed_test) + '\n')

        #####################################################################
        # ONE CLASS SVM
        #####################################################################
        # This method trains on healthy data only; failed training rows are
        # moved into the test set so they can still be scored.
        training = train_X[np.where(train_Y == 0)]
        testing = np.concatenate((test_X, train_X[np.where(train_Y == 1)]))
        testing_Y = np.concatenate((test_Y, train_Y[np.where(train_Y == 1)]))
        input_dimensions = str(training.shape[1])      # feature length
        samples_size = str(training.shape[0])          # number of rows
        input_dimensions_test = str(testing.shape[1])  # feature length
        samples_size_test = str(testing.shape[0])      # number of rows

        print()
        print('One Class SVM')  # healthy data to train only
        print()
        output.write("\n===== ONE CLASS SVM =====\n")
        output.write("===== DATA INFORMATION FOR THIS METHOD =====\n")
        output.write('training data size: ' + samples_size + ' by ' + input_dimensions + '\n')
        output.write('test data size: ' + samples_size_test + ' by ' + input_dimensions_test + '\n')
        output.write('training set is all healthy data, testing set contains other data and all failed points\n')
        clf = svm.OneClassSVM(nu=0.15, kernel='rbf', gamma=0.75)  # nu=0.15
        clf.fit(training)
        with open(filepath + 'svm_one_class.pickle', 'wb') as f:
            pickle.dump(clf, f)
        y_pred_train = clf.predict(training)
        y_pred_test = clf.predict(testing)
        anomaly_detection_error(y_pred_train, train_Y[train_Y == 0], "training", output, filepath + 'OneClassSVM', OneClassSVMMethod=True)
        anomaly_detection_error(y_pred_test, testing_Y, "testing", output, filepath + 'OneClassSVM', OneClassSVMMethod=True)

        #####################################################################
        # ISOLATION FOREST
        #####################################################################
        print()
        print('IsolationForest')
        print()
        output.write("\n===== ISOLATION FOREST =====\n")
        # Hyper-parameters presumably produced by an earlier tuning run —
        # the full-precision floats suggest a random/Bayesian search. TODO confirm.
        n_samples = 100
        samples_max = 0.7336951612320737
        contamination_fraction = 0.11294048783176784
        clf = IsolationForest(n_estimators=n_samples, max_samples=samples_max, contamination=contamination_fraction, random_state=0)
        clf.fit(train_X)
        with open(filepath + 'IsolationForest.pickle', 'wb') as f:
            pickle.dump(clf, f)
        y_pred_train = clf.predict(train_X)
        y_pred_test = clf.predict(test_X)
        anomaly_detection_error(y_pred_train, train_Y, "training", output, filepath + 'Isolation Forest')
        anomaly_detection_error(y_pred_test, test_Y, "testing", output, filepath + 'Isolation Forest')

        #####################################################################
        # ELLIPTIC ENVELOPE
        #####################################################################
        print()
        print('Elliptic Envelope')
        print()
        output.write("\n===== ELLIPTIC ENVELOPE =====\n")
        clf = EllipticEnvelope(contamination=0.175, random_state=0)
        clf.fit(train_X)
        with open(filepath + 'EllipticEnvelope.pickle', 'wb') as f:
            pickle.dump(clf, f)
        y_pred_train = clf.predict(train_X)
        y_pred_test = clf.predict(test_X)
        anomaly_detection_error(y_pred_train, train_Y, "training", output, filepath + 'EE')
        anomaly_detection_error(y_pred_test, test_Y, "testing", output, filepath + 'EE')

        #####################################################################
        # LOCAL OUTLIER FACTOR
        #####################################################################
        print()
        print('Local Outlier Factor')
        print()
        # BUGFIX: the header literal previously ended with a stray quote
        # ("...\n'") and was missing the space after the leading '====='.
        output.write("\n===== LOCAL OUTLIER FACTOR =====\n")
        for i in [100, 150, 200, 500, 1000]:
            clf = LocalOutlierFactor(n_neighbors=i, contamination=0.25)
            y_pred_train = clf.fit_predict(train_X)
            # NOTE(review): `_predict` is private sklearn API; newer releases
            # need LocalOutlierFactor(novelty=True) + `predict`. Confirm the
            # pinned scikit-learn version before upgrading.
            y_pred_test = clf._predict(test_X)
            anomaly_detection_error(y_pred_train, train_Y, "training", output, filepath + 'LOF')
            anomaly_detection_error(y_pred_test, test_Y, "testing", output, filepath + 'LOF')
            # BUGFIX: these pickles previously went to a hard-coded absolute
            # 'R:\\SMM-Structures\\...' path; now written beside the other
            # model pickles under `filepath`, consistent with this function.
            with open(filepath + 'LOF {} neighbours.pickle'.format(i), 'wb') as f:
                pickle.dump(clf, f)
        print()