def nearest_neighbors(coords, k, D=None):
    '''
    Find the k nearest neighbors of every sample from a pairwise distance
    matrix.

    inputs:
        coords: data coordinates in NF format; only used when D is not given
        k: Parameter MinPts, the number of nearest neighbors kept per sample
           (must satisfy 0 <= k <= N - 1)
        D: distance matrix (N by N); if not given, gen_dist_mat is called
           with the minkowski distance to generate one from coords
    Outputs:
        A list [NN, NN_dists], in that order:
        NN: k nearest neighbors matrix, np.array of int, (N by k)
            Contains the indices of coords, NOT the coordinates themselves
        NN_dists: k nearest neighbors distances matrix, np.array of float,
            (N by k)
    Raises:
        ValueError: if D is not a square 2-D matrix, or if k is negative or
            larger than N - 1
    '''
    import numpy as np

    if D is None:
        from scipy.spatial import distance
        from myFunctions import gen_dist_mat
        D = gen_dist_mat(coords, distance.minkowski)

    D = np.asarray(D)
    if D.ndim != 2 or D.shape[0] != D.shape[1]:
        raise ValueError(
            'D must be a square distance matrix, got shape {}'.format(D.shape))
    N = D.shape[0]
    # A sample has at most N - 1 neighbors besides itself. Without this check
    # a single available neighbor would silently broadcast across the row.
    if k < 0 or (N > 0 and k > N - 1):
        raise ValueError(
            'k must be between 0 and N - 1 = {}, got {}'.format(N - 1, k))

    # initialize nearest neighbors
    NN_dists = np.empty((N, k), dtype=float)
    NN = np.empty((N, k), dtype=int)
    for i in range(N):
        # Stable sort keeps equal distances in index order.
        order = np.argsort(D[i], kind='stable')
        # Drop the sample itself explicitly instead of assuming it sorts
        # first: a duplicate point with a lower index also has distance 0 and
        # would otherwise push the sample into its own neighbor list.
        neighbors = order[order != i][:k]
        NN[i] = neighbors
        NN_dists[i] = D[i, neighbors]
    return [NN, NN_dists]
def anomaly_detection(testdata_name,
                      rank_method_index,
                      test_EVs_ts,
                      test_MVs_ts,
                      fig_loc,
                      result_loc,
                      contam,
                      savefig_=True):
    '''
    Runs LOF and Isolation Forest for fault detection.

    Starts with using given rank function to group test_EVs_ts data to
    weather_ts data, then compare MVs data with test_MVs_ts data using
    LOF and Isolation Forest. Both detectors go through the same pipeline;
    only the prediction step and the experiment folder name differ.
    -------------------------------------------------------------------------
    global inputs (read from module scope):
        weather_ts: Divided TS weather data, numpy array in NT format
        MVs_ts: Corresponding divided TS MVs data, numpy array in NT format
        MVs: sequence of MV names; one prediction column is produced per MV
        rank_methods, rank_methods_names: rank functions and their names,
            both indexed by rank_method_index
    -------------------------------------------------------------------------
    inputs:
        testdata_name: folder name of testing dataset, used to name the
            experiment folders ('<name>_LOF', '<name>_IsolationForest')
        rank_method_index: index to identify rank method used
        test_EVs_ts: Divided TS EVs data, numpy array in NT format;
            test_EVs_ts[0] is used as the test weather data
        test_MVs_ts: Corresponding divided TS MVs data, numpy array in NT
            format
        fig_loc: folder path for saved faulty figure plots
        result_loc: folder path for fault detection rate result text files
        contam: contamination parameter used for scikit-learn anomaly
            detection algorithms
        savefig_: save figure if set to True, default is True
    outputs:
        Faulty TS is saved as a plot (only when savefig_ is True)
        The fault detection rate of a dataset is saved in results.txt and
        the per-sample predictions in MV_predictions.csv, both under
        <result_loc>/<rank method name>/<experiment name>
    returns:
        (predArr_lof, predArr_iForest): string arrays in NF format
        (row: day/sample, col: MV) with the MV names as first row. Each entry
        is '-1' (outlier), '1' (inlier), 'D=0' (all TS identical, skipped)
        or 'len<' (fewer than 10 reference days, skipped).
    '''
    from sklearn.neighbors import LocalOutlierFactor
    from sklearn.ensemble import IsolationForest
    from myFunctions import gen_dist_mat

    # Choose ranking method
    rank_group = rank_methods[rank_method_index]
    rank_method_name = rank_methods_names[rank_method_index]
    test_weather_ts = test_EVs_ts[0]  # test weather data
    n_samples = test_weather_ts.shape[0]

    def _predict_lof(NT_data, D):
        # LOF works on the precomputed distance matrix.
        LOF = LocalOutlierFactor(n_neighbors=10,
                                 metric='precomputed',
                                 contamination=contam)
        return LOF.fit_predict(D)

    def _predict_iforest(NT_data, D):
        # Isolation Forest works on the raw TS rows; D is unused here.
        IsoForest = IsolationForest(contamination=contam)
        IsoForest.fit(NT_data)
        return IsoForest.predict(NT_data)

    def _save_fault_plot(experimentName, MV_index, n, weather_group):
        # Plot the reference days (dotted) against the flagged test sample.
        plt.figure()
        for c in weather_group:
            plt.plot(MVs_ts[MV_index, c],
                     color='steelblue',
                     alpha=0.5,
                     linestyle='dotted')
        plt.plot(test_MVs_ts[MV_index, n], color='gold')
        # os.path.join instead of hard-coded backslashes so the folder
        # hierarchy is also created correctly on non-Windows systems.
        dir_loc = os.path.join(fig_loc, str(rank_method_name),
                               experimentName, str(MVs[MV_index]))
        os.makedirs(dir_loc, exist_ok=True)
        # save faulty plot
        plt.savefig(os.path.join(dir_loc, 'n{}.png'.format(n)))
        plt.close()

    def _collect_predictions(experimentName, predict):
        # One string array of per-sample labels for every MV.
        MV_predictions = []
        for MV_index in range(len(MVs)):
            predictions = []
            for n in range(n_samples):
                # Keep at most the first 30 entries of the 'Day' field
                # returned by the rank method for this test sample.
                weather_group = rank_group(weather_ts,
                                           test_weather_ts[n])['Day'][:30]
                print('{} - group length:{}'.format(n, len(weather_group)))
                if len(weather_group) < 10:
                    predictions.append('len<')
                    continue
                train_rows = MVs_ts[MV_index, weather_group]
                # reshape to row array to concatenate
                test_data_point = test_MVs_ts[MV_index, n].reshape(
                    (1, train_rows.shape[1]))
                # concatenated matrix of training data and the test data
                # sample; the test sample is always the LAST row
                NT_data = np.concatenate((train_rows, test_data_point),
                                         axis=0)
                D = gen_dist_mat(NT_data)  # distance matrix
                # if distance matrix are all zeros (all TS are identical),
                # then skip this
                if np.count_nonzero(D == 0) == D.shape[0] * D.shape[1]:
                    predictions.append('D=0')
                    continue
                pred = predict(NT_data, D)
                # change to string to avoid comparison error in numpy later
                predictions.append(str(pred[-1]))
                # if detected as outlier, save plot of MVs
                if pred[-1] == -1 and savefig_:
                    _save_fault_plot(experimentName, MV_index, n,
                                     weather_group)
            MV_predictions.append(np.array(predictions))
        return MV_predictions

    def _report(experimentName, MV_predictions):
        # Builtin bool: the np.bool alias was removed from NumPy.
        mask_shape = MV_predictions[0].shape
        p_fault = np.zeros(mask_shape, dtype=bool)  # faulty in ANY MV
        p_normal = np.ones(mask_shape, dtype=bool)  # normal in ALL MVs
        p_lack = np.ones(mask_shape, dtype=bool)  # lack of data in ALL MVs
        for predictions in MV_predictions:
            p_fault = np.logical_or(p_fault, predictions == '-1')
            normal_with_identical = np.logical_or(predictions == '1',
                                                  predictions == 'D=0')
            p_normal = np.logical_and(p_normal, normal_with_identical)
            p_lack = np.logical_and(p_lack, predictions == 'len<')

        # print results:
        fd_rate = 'Fault detection rate:\t {}%'.format(
            int(np.count_nonzero(p_fault)) / n_samples * 100)
        nd_rate = 'Normal operation rate:\t {}%'.format(
            int(np.count_nonzero(p_normal)) / n_samples * 100)
        ld_rate = 'Lack of data rate:\t {}%'.format(
            int(np.count_nonzero(p_lack)) / n_samples * 100)
        print(fd_rate)
        print(nd_rate)
        print(ld_rate)

        # Save results:
        dir_loc = os.path.join(result_loc, str(rank_method_name),
                               experimentName)
        os.makedirs(dir_loc, exist_ok=True)
        with open(os.path.join(dir_loc, 'results.txt'), 'w') as f:
            f.write(fd_rate + '\n' + nd_rate + '\n' + ld_rate)

        # save prediction results
        # NF format(row:day/sample, col:MV)
        predArr = np.array(MV_predictions).T
        header = np.array(MVs).reshape(1, len(MVs))  # add header
        predArr = np.concatenate((header, predArr), axis=0)
        np.savetxt(os.path.join(dir_loc, 'MV_predictions.csv'),
                   predArr,
                   fmt='%s',
                   delimiter=',')
        return predArr

    def _run_detector(experimentName, predict):
        return _report(experimentName,
                       _collect_predictions(experimentName, predict))

    # Local Outlier Factor
    predArr_lof = _run_detector('{}_LOF'.format(testdata_name), _predict_lof)
    # Isolation Forest
    predArr_iForest = _run_detector(
        '{}_IsolationForest'.format(testdata_name), _predict_iforest)

    # return prediction results
    return (predArr_lof, predArr_iForest)
def anomaly_detection(testdata_name, rank_method_index, test_EVs_ts, test_MVs_ts):
    '''
    Runs LOF and Isolation Forest for fault detection, writing to fixed
    output folders.

    For every MV and every test sample, the rank method selects reference
    days from weather_ts; the MV time series of those days plus the test
    sample are then passed to the detector, and the label of the test sample
    (the last row) is recorded.

    NOTE(review): an anomaly_detection with an extended signature
    (fig_loc, result_loc, contam, savefig_) is defined earlier in this file;
    being defined later, this version is the one bound to the name at import
    time. Confirm which one is meant to be kept.

    global inputs (read from module scope):
        weather_ts, MVs_ts: reference weather / MV time series
        MVs: sequence of MV names
        rank_methods, rank_methods_names: rank functions and their names,
            both indexed by rank_method_index
    inputs:
        testdata_name: name of the testing dataset, used to name the
            experiment folders ('<name>_LOF', '<name>_IsolationForest')
        rank_method_index: index to identify rank method used
        test_EVs_ts: test EVs data; test_EVs_ts[0] is used as weather data
        test_MVs_ts: corresponding test MVs data
    outputs:
        A plot is saved for every test sample flagged as an outlier.
        Fault / normal / lack-of-data rates are printed and written to
        results.txt in the experiment's result folder.
        Nothing is returned.
    '''
    from sklearn.neighbors import LocalOutlierFactor
    from sklearn.ensemble import IsolationForest
    from myFunctions import gen_dist_mat

    # Output folder templates: (rank method name, experiment name[, MV name])
    lof_fig_dir = r'C:\Users\James\Desktop\python_figs\rank\{}\{}\{}'
    iforest_fig_dir = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}\{}'
    result_dir = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}'

    # Choose ranking method
    rank_group = rank_methods[rank_method_index]
    rank_method_name = rank_methods_names[rank_method_index]
    test_weather_ts = test_EVs_ts[0]  # test weather data
    n_samples = test_weather_ts.shape[0]

    def _predict_lof(NT_data, D):
        # LOF works on the precomputed distance matrix.
        LOF = LocalOutlierFactor(n_neighbors=3, metric='precomputed')
        return LOF.fit_predict(D)

    def _predict_iforest(NT_data, D):
        # Isolation Forest works on the raw TS rows; D is unused here.
        IsoForest = IsolationForest()
        IsoForest.fit(NT_data)
        return IsoForest.predict(NT_data)

    def _save_fault_plot(fig_dir, experimentName, MV_index, n, weather_group):
        # Plot the reference days (dotted) against the flagged test sample.
        plt.figure()
        for c in weather_group:
            plt.plot(MVs_ts[MV_index, c], color='steelblue', alpha=0.5,
                     linestyle='dotted')
        plt.plot(test_MVs_ts[MV_index, n], color='gold')
        dir_loc = fig_dir.format(rank_method_name, experimentName,
                                 MVs[MV_index])
        os.makedirs(dir_loc, exist_ok=True)
        # save faulty plot
        plt.savefig(dir_loc + '\\n{}.png'.format(n))
        plt.close()

    def _collect_predictions(experimentName, predict, fig_dir):
        # One string array of per-sample labels for every MV.
        MV_predictions = []
        for MV_index in range(len(MVs)):
            predictions = []
            for n in range(n_samples):
                # Keep at most the first 20 entries of the 'Day' field
                # returned by the rank method for this test sample.
                weather_group = rank_group(weather_ts,
                                           test_weather_ts[n])['Day'][:20]
                print('{} - group length:{}'.format(n, len(weather_group)))
                if len(weather_group) < 10:
                    predictions.append('len<')
                    continue
                train_rows = MVs_ts[MV_index, weather_group]
                # reshape to row array to concatenate
                test_data_point = test_MVs_ts[MV_index, n].reshape(
                    (1, train_rows.shape[1]))
                # concatenated matrix of training data and the test data
                # sample; the test sample is always the LAST row
                NT_data = np.concatenate((train_rows, test_data_point),
                                         axis=0)
                D = gen_dist_mat(NT_data)  # distance matrix
                # if distance matrix are all zeros (all TS are identical),
                # then skip this
                if np.count_nonzero(D == 0) == D.shape[0] * D.shape[1]:
                    predictions.append('D=0')
                    continue
                pred = predict(NT_data, D)
                # change to string to avoid comparison error in numpy later
                predictions.append(str(pred[-1]))
                # if detected as outlier, save plot of MVs
                if pred[-1] == -1:
                    _save_fault_plot(fig_dir, experimentName, MV_index, n,
                                     weather_group)
            MV_predictions.append(np.array(predictions))
        return MV_predictions

    def _report(experimentName, MV_predictions):
        # Builtin bool: the np.bool alias was removed from NumPy.
        mask_shape = MV_predictions[0].shape
        p_fault = np.zeros(mask_shape, dtype=bool)  # faulty in ANY MV
        p_normal = np.ones(mask_shape, dtype=bool)  # normal in ALL MVs
        p_lack = np.ones(mask_shape, dtype=bool)  # lack of data in ALL MVs
        for predictions in MV_predictions:
            p_fault = np.logical_or(p_fault, predictions == '-1')
            normal_with_identical = np.logical_or(predictions == '1',
                                                  predictions == 'D=0')
            p_normal = np.logical_and(p_normal, normal_with_identical)
            p_lack = np.logical_and(p_lack, predictions == 'len<')

        # print results:
        fd_rate = 'Fault detection rate:\t {}%'.format(
            int(np.count_nonzero(p_fault)) / n_samples * 100)
        nd_rate = 'Normal operation rate:\t {}%'.format(
            int(np.count_nonzero(p_normal)) / n_samples * 100)
        ld_rate = 'Lack of data rate:\t {}%'.format(
            int(np.count_nonzero(p_lack)) / n_samples * 100)
        print(fd_rate)
        print(nd_rate)
        print(ld_rate)

        # Save results:
        dir_loc = result_dir.format(rank_method_name, experimentName)
        # The result folder is not guaranteed to exist (figure folders are
        # only created when an outlier is plotted), so create it before
        # opening the file.
        os.makedirs(dir_loc, exist_ok=True)
        with open(dir_loc + '\\results.txt', 'w') as f:
            f.write(fd_rate + '\n' + nd_rate + '\n' + ld_rate)

    # Local Outlier Factor
    experimentName = '{}_LOF'.format(testdata_name)
    _report(experimentName,
            _collect_predictions(experimentName, _predict_lof, lof_fig_dir))

    # Isolation Forest
    experimentName = '{}_IsolationForest'.format(testdata_name)
    _report(experimentName,
            _collect_predictions(experimentName, _predict_iforest,
                                 iforest_fig_dir))
predictions.append('len<') continue # # Training MVs data # MVs_ts[MV_index,combination] # # Test MVs data # test_MVs_ts[MV_index,n] # reshape to row array to concatenate test_data_point = test_MVs_ts[MV_index,n].reshape((1,MVs_ts[MV_index,weather_group].shape[1])) # concatenated matrix of training data and the test data sample NT_data = np.concatenate((MVs_ts[MV_index,weather_group],test_data_point),axis = 0) LOF = LocalOutlierFactor(n_neighbors = 3,metric='precomputed') D = gen_dist_mat(NT_data) # distance matrix # if distance matrix are all zeros(all TS are identical), then skip this if len(D[D == 0]) == D.shape[0]*D.shape[1]: predictions.append('D=0') continue pred = LOF.fit_predict(D) predictions.append(str(pred[-1])) # change to string to avoid comparison error in numpy later # if detected as outlier, save plot of MVs if pred[-1] == -1: plt.figure() # # draw only the current MV----- for c in weather_group: plt.plot(MVs_ts[MV_index,c],color='steelblue',alpha=0.5,linestyle='dotted')
# Finish and display the current figure: y-axis label, legend, then show.
plt.ylabel('Heat load[W]')
plt.legend()
plt.show()

# weekends+holidays when N = 2
# Indices of the samples whose k-means label equals 1. The result is not
# assigned to anything, so this line only has a visible effect when run
# interactively.
np.arange(kmeans.labels_.shape[0])[kmeans.labels_ == 1]

## Compare with DBSCAN
# Get this current script file's directory:
loc = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
# Set working directory (changes the process-wide cwd; presumably so the
# local myFunctions module below can be imported -- TODO confirm)
os.chdir(loc)
from myFunctions import gen_dist_mat, k_dist
from scipy.spatial import distance
# Pairwise distance matrix of heatload using scipy's minkowski distance.
dist = distance.minkowski
D = gen_dist_mat(heatload, dist)

# Commented-out local copy of k_dist, kept for reference; k_dist is imported
# from myFunctions above instead.
# '''
# inputs:
# D: distance matrix(N by N)
# k: k-th neighbor distance
# '''
# def k_dist(D,k = 4):
# import numpy as np
# D = np.array(D)
# N = D.shape[0]
# # initialize k_dist vector
# k_dist = np.zeros((N,1))
# for i in range(N):
# row = list(D[i,:])
# for j in range(k): # remove min(row) k times, not k-1 times, because closest is always itself!