Example #1
import numpy as np
import seaborn as sns
from sklearn.neighbors import LocalOutlierFactor

# dataset_with_distances and expected_abnormal_day are assumed to be
# defined in earlier notebook cells.
    def _local_reachability_density(self, distances_X, neighbors_indices):
        """Local reachability density (LRD): the inverse of the mean
        reachability distance of a sample to its k nearest neighbors."""
        dist_k = self._distances_fit_X_[neighbors_indices,
                                        self.n_neighbors_ - 1]
        reach_dist_array = np.maximum(distances_X, dist_k)

        # 1e-10 avoids `nan` when the number of duplicates > n_neighbors_:
        return 1. / (np.mean(reach_dist_array, axis=1) + 1e-10)
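
# A toy check of the LRD formula implemented above, using made-up
# distances (an illustrative sketch, not part of the original example):
# reach_dist_k(a, b) = max(dist(a, b), k-distance(b)) and
# lrd(a) = 1 / mean_b(reach_dist_k(a, b)).
toy_distances = np.array([[1.0, 2.0],   # distances from each sample to
                          [0.5, 1.5]])  # its two nearest neighbors
toy_dist_k = np.array([[1.2, 1.8],      # k-distance of those neighbors
                       [0.9, 1.1]])
toy_reach = np.maximum(toy_distances, toy_dist_k)
toy_lrd = 1. / (toy_reach.mean(axis=1) + 1e-10)  # approx. [0.625, 0.833]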


# In[75]:

# predictor_keys = ['mean', 'expenses', 'sum', 'distance_traveled']
predictor_keys = ['expenses', 'distance_traveled']

# Fit with novelty=True so the model can score unseen samples
# (the private `_predict` is not part of the public scikit-learn API).
model = LocalOutlierFactor(n_jobs=-1, novelty=True)
model.fit(dataset_with_distances[predictor_keys])
y = model.predict(dataset_with_distances[predictor_keys])
model.predict([expected_abnormal_day[predictor_keys]])

# In[76]:

dataset_with_distances['lof_anomaly'] = y == -1
dataset_with_distances['lof_anomaly'].sum()
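
# The fitted estimator also stores per-sample training scores in
# negative_outlier_factor_ (the more negative, the more anomalous).
# A sketch for ranking the flagged days by that score:
dataset_with_distances['lof_score'] = model.negative_outlier_factor_
dataset_with_distances.nsmallest(5, 'lof_score')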

# In[77]:

sns.lmplot(x='expenses',
           y='distance_traveled',
           data=dataset_with_distances,
           fit_reg=False,
           hue='lof_anomaly',
           scatter_kws={
               'marker': 'D',
           })
Example #2
import pickle

import numpy as np
from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

# anomaly_detection_error is a project-local reporting helper assumed
# to be defined elsewhere in this repository.
def AnomalyDetection(filepath):
	train_X = np.loadtxt(filepath+'normalized_train_file.csv', delimiter=',', dtype=float, skiprows=1)
	test_X = np.loadtxt(filepath+'pseudonormalized_test_file.csv', delimiter=',', dtype=float, skiprows=1)
	train_Y = np.loadtxt(filepath+'Y_train_file.csv', delimiter=',', dtype=float, skiprows=1)
	test_Y = np.loadtxt(filepath+'Y_test_file.csv', delimiter=',', dtype=float, skiprows=1)
	input_dimensions = str(train_X.shape[1])  # feature length
	samples_size = str(train_X.shape[0])  # number of rows
	input_dimensions_test = str(test_X.shape[1])  # feature length
	samples_size_test = str(test_X.shape[0])  # number of rows
	num_failed_train = train_Y[train_Y==1].shape[0]
	num_failed_test = test_Y[test_Y==1].shape[0]

	with open(filepath+'outliers_new_results.txt', 'w') as output:
		output.write("===== DATA INFORMATION =====\n")
		output.write('training data size: ' +samples_size +' by '+ input_dimensions+'\n')
		output.write('test data size: '  +samples_size_test +' by '+ input_dimensions_test+'\n')
		output.write('failed points in training: ' + str(num_failed_train) + '\n')
		output.write('failed points in testing: ' + str(num_failed_test) + '\n')

		# change input data for this method: train only on healthy points;
		# test on the original test set plus all failed training points
		training = train_X[np.where(train_Y==0)]
		testing = np.concatenate((test_X, train_X[np.where(train_Y==1)]))
		testing_Y = np.concatenate((test_Y, train_Y[np.where(train_Y==1)]))
		input_dimensions = str(training.shape[1]) #feature length
		samples_size =str(training.shape[0]) #number of rows
		input_dimensions_test = str(testing.shape[1] )#feature length
		samples_size_test = str(testing.shape[0]) #number of rows
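		# Sanity-check sketch (not in the original): the rearranged split
		# keeps the feature count and accounts for the moved failed points.
		assert training.shape[1] == train_X.shape[1]
		assert testing.shape[0] == test_X.shape[0] + num_failed_train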
		#####################################################################
		# ONE CLASS SVM
		#####################################################################
		print()
		print('One Class SVM') # healthy data to train only
		print()

		output.write("\n===== ONE CLASS SVM =====\n")
		output.write("===== DATA INFORMATION FOR THIS METHOD 	=====\n")
		output.write('training data size: ' +samples_size +' by '+ input_dimensions+'\n')
		output.write('test data size: '  +samples_size_test +' by '+ input_dimensions_test+'\n')
		output.write('training set is all healthy data, testing set contains other data and all failed points\n')

		clf = svm.OneClassSVM(nu=0.15, kernel='rbf', gamma=0.75)
		clf.fit(training)
		with open(filepath+'svm_one_class.pickle','wb') as f:
			pickle.dump(clf,f)
		y_pred_train = clf.predict(training)
		y_pred_test = clf.predict(testing)
		anomaly_detection_error(y_pred_train, train_Y[train_Y==0], "training", output, filepath+'OneClassSVM', OneClassSVMMethod=True)
		anomaly_detection_error(y_pred_test, testing_Y, "testing", output, filepath+'OneClassSVM', OneClassSVMMethod=True)
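
		# Illustrative sketch (assumed, not in the original): the signed
		# distance to the learned boundary is a continuous score; negative
		# values fall outside the estimated support.
		svm_scores = clf.decision_function(testing)
		output.write('fraction outside SVM boundary: ' + str(float((svm_scores.ravel() < 0).mean())) + '\n')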

		#####################################################################
		# ISOLATION FOREST
		#####################################################################
		print()
		print('IsolationForest')
		print()

		output.write("\n===== ISOLATION FOREST =====\n")

		# Tuned settings; n_estimators is the number of trees in the forest
		n_estimators = 100
		samples_max = 0.7336951612320737
		contamination_fraction = 0.11294048783176784

		clf = IsolationForest(n_estimators=n_estimators,
							max_samples=samples_max,
							contamination=contamination_fraction,
							random_state=0)
		clf.fit(train_X)
		with open(filepath+'IsolationForest.pickle','wb') as f:
			pickle.dump(clf,f)
		y_pred_train = clf.predict(train_X)
		y_pred_test = clf.predict(test_X)
		anomaly_detection_error(y_pred_train, train_Y, "training", output, filepath+'Isolation Forest')
		anomaly_detection_error(y_pred_test, test_Y, "testing", output, filepath+'Isolation Forest')
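		# Illustrative sketch (assumed): score_samples() exposes continuous
		# anomaly scores (higher = more normal), which can be thresholded
		# manually instead of relying on the contamination-based labels.
		test_scores = clf.score_samples(test_X)
		output.write('mean isolation score (test): ' + str(float(test_scores.mean())) + '\n')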
					
		#####################################################################
		# ELLIPTIC ENVELOPE
		#####################################################################
		print()
		print('Elliptic Envelope')
		print()

		output.write("\n===== ELLIPTIC ENVELOPE =====\n")

		clf = EllipticEnvelope(contamination=0.175, random_state=0)
		clf.fit(train_X)
		with open(filepath+'EllipticEnvelope.pickle','wb') as f:
			pickle.dump(clf,f)
		y_pred_train = clf.predict(train_X)
		y_pred_test = clf.predict(test_X)
		anomaly_detection_error(y_pred_train, train_Y, "training", output, filepath+'EE')
		anomaly_detection_error(y_pred_test, test_Y, "testing", output, filepath+'EE')
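
		# Illustrative sketch (assumed): the fitted envelope also exposes
		# Mahalanobis distances to its robust covariance estimate; large
		# distances are points far outside the ellipse.
		mahal_test = clf.mahalanobis(test_X)
		output.write('max Mahalanobis distance (test): ' + str(float(mahal_test.max())) + '\n')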
		
		#####################################################################
		# LOCAL OUTLIER FACTOR
		#####################################################################
		print()
		print('Local Outlier Factor')
		print()

		output.write("\n=====LOCAL OUTLIER FACTOR =====\n'")

		for i in [100, 150, 200, 500, 1000]:
			# novelty=True is required to score the held-out test set with
			# predict(); the private _predict() is not a public sklearn API.
			clf = LocalOutlierFactor(n_neighbors=i, contamination=0.25, novelty=True)

			clf.fit(train_X)
			y_pred_train = clf.predict(train_X)
			y_pred_test = clf.predict(test_X)
			anomaly_detection_error(y_pred_train, train_Y, "training", output, filepath+'LOF')
			anomaly_detection_error(y_pred_test, test_Y, "testing", output, filepath+'LOF')
			with open('R:\\SMM-Structures\\A1-010391 (Navy IPMS data analytics)\\Technical\\Data\\datafiles\\'+'LOF {} neighbours.pickle'.format(i),'wb') as f:
				pickle.dump(clf,f)
		print()
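
# A minimal driver sketch (the path is a placeholder assumption; the
# four CSV files must already exist in that directory):
if __name__ == '__main__':
	AnomalyDetection('./data/')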