Example #1
def nearest_neighbors(coords, k, D=None):
    '''
    Inputs:
        coords: data coordinates in NF format (N samples by F features)
        k: the number of nearest neighbors (the MinPts parameter)
        D: distance matrix; if not given, gen_dist_mat is used to generate one
    Outputs:
        NN: k-nearest-neighbor index matrix, np.array, (N by k)
            Contains indices into coords, NOT the coordinates themselves
        NN_dists: k-nearest-neighbor distance matrix, np.array, (N by k)
    '''
    import numpy as np

    if D is None:
        from scipy.spatial import distance
        from myFunctions import gen_dist_mat
        dist = distance.minkowski
        D = gen_dist_mat(coords, dist)

    N = D.shape[0]
    # initialize nearest neighbors
    NN_dists = np.empty((N, k), dtype=float)
    NN = np.empty((N, k), dtype=int)

    for i in range(N):
        # use numpy's structured array for sorting
        dtype = [('distance', float), ('index', int)]
        structure_dist = np.empty((N, ), dtype=dtype)
        structure_dist['distance'] = D[i]
        structure_dist['index'] = np.arange(N)
        structure_dist = np.sort(structure_dist, order='distance')

        # start from 1 to skip the point itself, since the distance to itself is always 0
        NN_dists[i] = structure_dist['distance'][1:k + 1]
        NN[i] = structure_dist['index'][1:k + 1]

    return ([NN, NN_dists])
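
A minimal usage sketch (not from the original source): the coordinates below are made up, and the distance matrix is precomputed with scipy so that myFunctions.gen_dist_mat is not required.

import numpy as np
from scipy.spatial import distance

coords = np.array([[0.0, 0.0],
                   [1.0, 0.0],
                   [0.0, 1.0],
                   [5.0, 5.0]])
# pairwise Minkowski (p=2, i.e. Euclidean) distance matrix, N by N
D = distance.cdist(coords, coords, metric='minkowski', p=2)
NN, NN_dists = nearest_neighbors(coords, k=2, D=D)
print(NN)        # indices of the 2 nearest neighbors of each point, shape (4, 2)
print(NN_dists)  # the corresponding distances, shape (4, 2)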
Example #2
def anomaly_detection(testdata_name,
                      rank_method_index,
                      test_EVs_ts,
                      test_MVs_ts,
                      fig_loc,
                      result_loc,
                      contam,
                      savefig_=True):
    '''
    Runs LOF and Isolation Forest for fault detection.
    The given ranking function is first used to match each test weather sample
    (from test_EVs_ts) against the training weather_ts data to form a group of
    similar days; the training MVs_ts of that group are then compared with the
    corresponding test_MVs_ts sample using LOF and Isolation Forest.
    
    -----------------------------------------------------------------------------
    global inputs:
        weather_ts: Divided TS weather data, numpy array in NT format
        MVs_ts: Corresponding divided TS MVs data, numpy array in NT format
        n_seg: number of segments for PAA conversion
    -----------------------------------------------------------------------------
    inputs:
        testdata_name: folder name of the testing dataset, used to name the experiment's output folders
        rank_method_index: index to identify rank method used
        test_EVs_ts: Divided TS EVs data, numpy array in NT format
        test_MVs_ts: Corresponding divided TS MVs data, numpy array in NT format
        fig_loc: folder path for saved faulty figure plots
        result_loc: folder path for fault detection rate result text files
        contam: contamination parameter used for scikit-learn anomaly detection algorithms
        savefig_: save figure if set to True, default is True
    outputs:
        Faulty TS samples are saved as plots
        The fault detection rate of each dataset is saved in a text file
        Returns (predArr_lof, predArr_iForest): per-MV prediction arrays from LOF and Isolation Forest
    '''
    # Local Outlier Factor
    from sklearn.neighbors import LocalOutlierFactor
    from myFunctions import gen_dist_mat

    #
    experimentName = '{}_LOF'.format(testdata_name)
    # Choose ranking method
    # rank_group = rank_high_low
    rank_group = rank_methods[rank_method_index]
    rank_method_name = rank_methods_names[rank_method_index]

    test_weather_ts = test_EVs_ts[0]  # test weather data

    # MV_index = 0 # MV we are examining
    MV_predictions = []
    for MV_index in range(len(MVs)):
        predictions = []
        for n in range(test_weather_ts.shape[0]):
            # the 30 closest weather days, ranked by the chosen ranking method
            weather_group = rank_group(weather_ts,
                                       test_weather_ts[n])['Day'][:30]

            print('{} - group length:{}'.format(n, len(weather_group)))
            if len(weather_group) < 10:
                predictions.append('len<')
                continue

            # reshape to row array to concatenate
            test_data_point = test_MVs_ts[MV_index, n].reshape(
                (1, MVs_ts[MV_index, weather_group].shape[1]))
            # concatenated matrix of training data and the test data sample
            NT_data = np.concatenate(
                (MVs_ts[MV_index, weather_group], test_data_point), axis=0)

            LOF = LocalOutlierFactor(n_neighbors=10,
                                     metric='precomputed',
                                     contamination=contam)
            D = gen_dist_mat(NT_data)  # distance matrix

            # if the distance matrix is all zeros (all TS are identical), skip this sample
            if len(D[D == 0]) == D.shape[0] * D.shape[1]:
                predictions.append('D=0')
                continue

            pred = LOF.fit_predict(D)
            # store as a string so later numpy comparisons against the 'D=0' and 'len<' labels work
            predictions.append(str(pred[-1]))

            # if detected as outlier, save plot of MVs
            if pred[-1] == -1 and savefig_:
                plt.figure()
                # # draw only the current MV-----
                for c in weather_group:
                    plt.plot(MVs_ts[MV_index, c],
                             color='steelblue',
                             alpha=0.5,
                             linestyle='dotted')
                plt.plot(test_MVs_ts[MV_index, n], color='gold')
                #--------------------------------

                # # draw for all MVs-------------
                # for index in range(MVs_ts.shape[0]):
                #     for c in combination:
                #         plt.plot(MVs_ts[index,c],color=color_list[index],alpha=0.5,linestyle='dotted')
                #     plt.plot(test_MVs_ts[index,n],color='gold')
                # plt.show()
                # -------------------------------

                # dir_loc = r'C:\Users\James\Desktop\python_figs\rank\{}\{}\{}'.format(rank_method_name,experimentName,MVs[MV_index])
                dir_loc = fig_loc + r'\{}\{}\{}'.format(
                    rank_method_name, experimentName, MVs[MV_index])

                # create the directory if it does not exist
                if not os.path.exists(dir_loc):
                    os.makedirs(dir_loc)
                # save faulty plot
                plt.savefig(dir_loc + '\\n{}.png'.format(n))
                plt.close()

        MV_predictions.append(np.array(predictions))

    p_fault = np.empty(MV_predictions[0].shape, dtype=bool)  # faulty
    p_normal = np.empty(MV_predictions[0].shape, dtype=bool)  # normal
    p_lack = np.empty(MV_predictions[0].shape, dtype=bool)  # lack of data
    p_fault[:] = False
    p_normal[:] = True
    p_lack[:] = True
    for predictions in MV_predictions:
        p_fault = np.logical_or(p_fault, predictions == '-1')
        normal_with_identical = np.logical_or(predictions == '1',
                                              predictions == 'D=0')
        p_normal = np.logical_and(p_normal, normal_with_identical)
        p_lack = np.logical_and(p_lack, predictions == 'len<')

    # indices of the TS samples classified as faulty / normal / lacking data
    fault_index = np.arange(len(p_fault))[p_fault]
    normal_index = np.arange(len(p_normal))[p_normal]
    lack_index = np.arange(len(p_lack))[p_lack]

    # print results:
    fd_rate = 'Fault detection rate:\t {}%'.format(
        len(fault_index) / test_weather_ts.shape[0] * 100)
    nd_rate = 'Normal operation rate:\t {}%'.format(
        len(normal_index) / test_weather_ts.shape[0] * 100)
    ld_rate = 'Lack of data rate:\t {}%'.format(
        len(lack_index) / test_weather_ts.shape[0] * 100)

    print(fd_rate)
    print(nd_rate)
    print(ld_rate)

    # Save results:
    # dir_loc = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}'.format(rank_method_name,experimentName)
    dir_loc = result_loc + r'\{}\{}'.format(rank_method_name, experimentName)

    # create the directory if it does not exist
    if not os.path.exists(dir_loc):
        os.makedirs(dir_loc)

    with open(dir_loc + '\\results.txt', 'w') as f:
        f.write(fd_rate + '\n' + nd_rate + '\n' + ld_rate)

    # save prediction results
    predArr_lof = np.array(
        MV_predictions).T  # NF format(row:day/sample, col:MV)
    header = np.array(MVs).reshape(1, len(MVs))  # add header
    predArr_lof = np.concatenate((header, predArr_lof), axis=0)
    np.savetxt(dir_loc + '\\MV_predictions.csv',
               predArr_lof,
               fmt='%s',
               delimiter=',')

    # Isolation Forest

    from sklearn.ensemble import IsolationForest
    from myFunctions import gen_dist_mat

    #
    experimentName = '{}_IsolationForest'.format(testdata_name)
    # Choose ranking method
    # rank_group = rank_high_low
    rank_group = rank_methods[rank_method_index]
    rank_method_name = rank_methods_names[rank_method_index]

    # test_weather_ts = test_EVs_ts[0] # test weather data

    # MV_index = 0 # MV we are examining
    MV_predictions = []
    for MV_index in range(len(MVs)):
        predictions = []
        for n in range(test_weather_ts.shape[0]):
            # the 30 closest weather days, ranked by the chosen ranking method
            weather_group = rank_group(weather_ts,
                                       test_weather_ts[n])['Day'][:30]

            print('{} - group length:{}'.format(n, len(weather_group)))
            if len(weather_group) < 10:
                predictions.append('len<')
                continue

            # reshape to row array to concatenate
            test_data_point = test_MVs_ts[MV_index, n].reshape(
                (1, MVs_ts[MV_index, weather_group].shape[1]))
            # concatenated matrix of training data and the test data sample
            NT_data = np.concatenate(
                (MVs_ts[MV_index, weather_group], test_data_point), axis=0)

            D = gen_dist_mat(NT_data)  # distance matrix

            # if the distance matrix is all zeros (all TS are identical), skip this sample
            if len(D[D == 0]) == D.shape[0] * D.shape[1]:
                predictions.append('D=0')
                continue

            IsoForest = IsolationForest(contamination=contam)
            IsoForest.fit(NT_data)
            pred = IsoForest.predict(NT_data)

            # store as a string so later numpy comparisons against the 'D=0' and 'len<' labels work
            predictions.append(str(pred[-1]))

            # if detected as outlier, save plot of MVs
            if pred[-1] == -1 and savefig_:
                plt.figure()
                # # draw only the current MV-----
                for c in weather_group:
                    plt.plot(MVs_ts[MV_index, c],
                             color='steelblue',
                             alpha=0.5,
                             linestyle='dotted')
                plt.plot(test_MVs_ts[MV_index, n], color='gold')
                #--------------------------------

                # # draw for all MVs-------------
                # for index in range(MVs_ts.shape[0]):
                #     for c in combination:
                #         plt.plot(MVs_ts[index,c],color=color_list[index],alpha=0.5,linestyle='dotted')
                #     plt.plot(test_MVs_ts[index,n],color='gold')
                # plt.show()
                # -------------------------------

                # dir_loc = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}\{}'.format(rank_method_name,experimentName,MVs[MV_index])
                dir_loc = fig_loc + r'\{}\{}\{}'.format(
                    rank_method_name, experimentName, MVs[MV_index])

                # create the directory if it does not exist
                if not os.path.exists(dir_loc):
                    os.makedirs(dir_loc)
                # save faulty plot
                plt.savefig(dir_loc + '\\n{}.png'.format(n))
                plt.close()

        MV_predictions.append(np.array(predictions))

    p_fault = np.empty(MV_predictions[0].shape, dtype=bool)  # faulty
    p_normal = np.empty(MV_predictions[0].shape, dtype=bool)  # normal
    p_lack = np.empty(MV_predictions[0].shape, dtype=bool)  # lack of data
    p_fault[:] = False
    p_normal[:] = True
    p_lack[:] = True
    for predictions in MV_predictions:
        p_fault = np.logical_or(p_fault, predictions == '-1')
        normal_with_identical = np.logical_or(predictions == '1',
                                              predictions == 'D=0')
        p_normal = np.logical_and(p_normal, normal_with_identical)
        p_lack = np.logical_and(p_lack, predictions == 'len<')

    # indices of the TS samples classified as faulty / normal / lacking data
    fault_index = np.arange(len(p_fault))[p_fault]
    normal_index = np.arange(len(p_normal))[p_normal]
    lack_index = np.arange(len(p_lack))[p_lack]

    # print results:
    fd_rate = 'Fault detection rate:\t {}%'.format(
        len(fault_index) / test_weather_ts.shape[0] * 100)
    nd_rate = 'Normal operation rate:\t {}%'.format(
        len(normal_index) / test_weather_ts.shape[0] * 100)
    ld_rate = 'Lack of data rate:\t {}%'.format(
        len(lack_index) / test_weather_ts.shape[0] * 100)

    print(fd_rate)
    print(nd_rate)
    print(ld_rate)

    # Save results:
    # dir_loc = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}'.format(rank_method_name,experimentName)
    dir_loc = result_loc + r'\{}\{}'.format(rank_method_name, experimentName)

    # create the directory if it does not exist
    if not os.path.exists(dir_loc):
        os.makedirs(dir_loc)

    with open(dir_loc + '\\results.txt', 'w') as f:
        f.write(fd_rate + '\n' + nd_rate + '\n' + ld_rate)

    # save prediction results
    predArr_iForest = np.array(
        MV_predictions).T  # NF format(row:day/sample, col:MV)
    header = np.array(MVs).reshape(1, len(MVs))  # add header
    predArr_iForest = np.concatenate((header, predArr_iForest), axis=0)
    np.savetxt(dir_loc + '\\MV_predictions.csv',
               predArr_iForest,
               fmt='%s',
               delimiter=',')
    # return prediction results
    return (predArr_lof, predArr_iForest)
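
A hedged usage sketch for anomaly_detection (not from the original source). It assumes the module-level globals described in the docstring (weather_ts, MVs_ts, MVs, rank_methods, rank_methods_names) and the module-level imports (numpy as np, matplotlib.pyplot as plt, os) are already defined; the dataset name, paths, and contamination value below are placeholders.

predArr_lof, predArr_iForest = anomaly_detection(
    testdata_name='fault_case_01',      # hypothetical test dataset folder name
    rank_method_index=0,                # use the first entry in rank_methods
    test_EVs_ts=test_EVs_ts,            # divided test EV time series, NT format
    test_MVs_ts=test_MVs_ts,            # divided test MV time series, NT format
    fig_loc=r'C:\results\figs',         # placeholder folder for faulty-sample plots
    result_loc=r'C:\results\rates',     # placeholder folder for detection-rate text files
    contam=0.1,                         # contamination passed to LOF / IsolationForest
    savefig_=False)                     # skip saving per-sample plots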
Example #3
def anomaly_detection(testdata_name,rank_method_index,test_EVs_ts,test_MVs_ts):
    # Local Outlier Factor
    from sklearn.neighbors import LocalOutlierFactor
    from myFunctions import gen_dist_mat
    
    #
    experimentName = '{}_LOF'.format(testdata_name)
    # Choose ranking method
    # rank_group = rank_high_low
    rank_group = rank_methods[rank_method_index]
    rank_method_name = rank_methods_names[rank_method_index]
    
    test_weather_ts = test_EVs_ts[0] # test weather data
    
    # MV_index = 0 # MV we are examining
    MV_predictions = []
    for MV_index in range(len(MVs)):
        predictions = []
        for n in range(test_weather_ts.shape[0]):
            # the 20 closest weather days, ranked by the chosen ranking method
            weather_group = rank_group(weather_ts,test_weather_ts[n])['Day'][:20]
    
            print('{} - group length:{}'.format(n,len(weather_group)))
            if len(weather_group) < 10:
                predictions.append('len<')
                continue
            
    
            # reshape to row array to concatenate
            test_data_point = test_MVs_ts[MV_index,n].reshape((1,MVs_ts[MV_index,weather_group].shape[1]))
            # concatenated matrix of training data and the test data sample
            NT_data = np.concatenate((MVs_ts[MV_index,weather_group],test_data_point),axis = 0)
            
            LOF = LocalOutlierFactor(n_neighbors = 3,metric='precomputed')
            D = gen_dist_mat(NT_data) # distance matrix
            
            # if the distance matrix is all zeros (all TS are identical), skip this sample
            if len(D[D == 0]) == D.shape[0]*D.shape[1]:
                predictions.append('D=0')
                continue
                
            pred = LOF.fit_predict(D)
            # store as a string so later numpy comparisons against the 'D=0' and 'len<' labels work
            predictions.append(str(pred[-1]))
            
            # if detected as outlier, save plot of MVs
            if pred[-1] == -1:
                plt.figure()
                # # draw only the current MV-----
                for c in weather_group:
                    plt.plot(MVs_ts[MV_index,c],color='steelblue',alpha=0.5,linestyle='dotted')
                plt.plot(test_MVs_ts[MV_index,n],color='gold')
                #--------------------------------
                
                # # draw for all MVs-------------
                # for index in range(MVs_ts.shape[0]):
                #     for c in combination:
                #         plt.plot(MVs_ts[index,c],color=color_list[index],alpha=0.5,linestyle='dotted')
                #     plt.plot(test_MVs_ts[index,n],color='gold')
                # plt.show()
                # -------------------------------
                
                dir_loc = r'C:\Users\James\Desktop\python_figs\rank\{}\{}\{}'.format(rank_method_name,experimentName,MVs[MV_index])
                # create the directory if it does not exist
                if not os.path.exists(dir_loc):
                    os.makedirs(dir_loc)
                # save faulty plot
                plt.savefig(dir_loc + '\\n{}.png'.format(n))
                plt.close()
            
        MV_predictions.append(np.array(predictions))
    
    
    p_fault = np.empty(MV_predictions[0].shape,dtype = bool) # faulty
    p_normal = np.empty(MV_predictions[0].shape,dtype = bool) # normal
    p_lack = np.empty(MV_predictions[0].shape,dtype = bool) # lack of data
    p_fault[:] = False
    p_normal[:] = True
    p_lack[:] = True
    for predictions in MV_predictions:
        p_fault = np.logical_or(p_fault, predictions=='-1')
        normal_with_identical = np.logical_or(predictions=='1',predictions=='D=0')
        p_normal = np.logical_and(p_normal,normal_with_identical)
        p_lack = np.logical_and(p_lack, predictions=='len<')
        
    # indices of the TS samples classified as faulty / normal / lacking data
    fault_index = np.arange(len(p_fault))[p_fault]
    normal_index = np.arange(len(p_normal))[p_normal]
    lack_index = np.arange(len(p_lack))[p_lack]
    
    # print results:
    fd_rate = 'Fault detection rate:\t {}%'.format(len(fault_index)/test_weather_ts.shape[0]*100)
    nd_rate = 'Normal operation rate:\t {}%'.format(len(normal_index)/test_weather_ts.shape[0]*100)
    ld_rate = 'Lack of data rate:\t {}%'.format(len(lack_index)/test_weather_ts.shape[0]*100)
    
    print(fd_rate)
    print(nd_rate)
    print(ld_rate)
    
    # Save results:
    dir_loc = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}'.format(rank_method_name,experimentName)
    # create the directory if it does not exist
    if not os.path.exists(dir_loc):
        os.makedirs(dir_loc)
    with open(dir_loc+'\\results.txt','w') as f:
        f.write(fd_rate + '\n' + nd_rate + '\n' + ld_rate)

    # Isolation Forest
    
    from sklearn.ensemble import IsolationForest
    from myFunctions import gen_dist_mat
    
    #
    experimentName = '{}_IsolationForest'.format(testdata_name)
    # Choose ranking method
    # rank_group = rank_high_low
    rank_group = rank_methods[rank_method_index]
    rank_method_name = rank_methods_names[rank_method_index]
    
    # test_weather_ts = test_EVs_ts[0] # test weather data
    
    # MV_index = 0 # MV we are examining
    MV_predictions = []
    for MV_index in range(len(MVs)):
        predictions = []
        for n in range(test_weather_ts.shape[0]):
            # the 20 closest weather days, ranked by the chosen ranking method
            weather_group = rank_group(weather_ts,test_weather_ts[n])['Day'][:20]
            
            print('{} - group length:{}'.format(n,len(weather_group)))
            if len(weather_group) < 10:
                predictions.append('len<')
                continue
            
    
            # reshape to row array to concatenate
            test_data_point = test_MVs_ts[MV_index,n].reshape((1,MVs_ts[MV_index,weather_group].shape[1]))
            # concatenated matrix of training data and the test data sample
            NT_data = np.concatenate((MVs_ts[MV_index,weather_group],test_data_point),axis = 0)
            
            D = gen_dist_mat(NT_data) # distance matrix
            
            # if the distance matrix is all zeros (all TS are identical), skip this sample
            if len(D[D == 0]) == D.shape[0]*D.shape[1]:
                predictions.append('D=0')
                continue
            
            IsoForest = IsolationForest()
            IsoForest.fit(NT_data)
            pred = IsoForest.predict(NT_data)    
            
            # store as a string so later numpy comparisons against the 'D=0' and 'len<' labels work
            predictions.append(str(pred[-1]))
            
            # if detected as outlier, save plot of MVs
            if pred[-1] == -1:
                plt.figure()
                # # draw only the current MV-----
                for c in weather_group:
                    plt.plot(MVs_ts[MV_index,c],color='steelblue',alpha=0.5,linestyle='dotted')
                plt.plot(test_MVs_ts[MV_index,n],color='gold')
                #--------------------------------
                
                # # draw for all MVs-------------
                # for index in range(MVs_ts.shape[0]):
                #     for c in combination:
                #         plt.plot(MVs_ts[index,c],color=color_list[index],alpha=0.5,linestyle='dotted')
                #     plt.plot(test_MVs_ts[index,n],color='gold')
                # plt.show()
                # -------------------------------
                
                dir_loc = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}\{}'.format(rank_method_name,experimentName,MVs[MV_index])
                # create the directory if it does not exist
                if not os.path.exists(dir_loc):
                    os.makedirs(dir_loc)
                # save faulty plot
                plt.savefig(dir_loc + '\\n{}.png'.format(n))
                plt.close()
            
        MV_predictions.append(np.array(predictions))
    
    
    p_fault = np.empty(MV_predictions[0].shape,dtype = bool) # faulty
    p_normal = np.empty(MV_predictions[0].shape,dtype = bool) # normal
    p_lack = np.empty(MV_predictions[0].shape,dtype = bool) # lack of data
    p_fault[:] = False
    p_normal[:] = True
    p_lack[:] = True
    for predictions in MV_predictions:
        p_fault = np.logical_or(p_fault, predictions=='-1')
        normal_with_identical = np.logical_or(predictions=='1',predictions=='D=0')
        p_normal = np.logical_and(p_normal,normal_with_identical)
        p_lack = np.logical_and(p_lack, predictions=='len<')
        
    # indices of the TS samples classified as faulty / normal / lacking data
    fault_index = np.arange(len(p_fault))[p_fault]
    normal_index = np.arange(len(p_normal))[p_normal]
    lack_index = np.arange(len(p_lack))[p_lack]
    
    # print results:
    fd_rate = 'Fault detection rate:\t {}%'.format(len(fault_index)/test_weather_ts.shape[0]*100)
    nd_rate = 'Normal operation rate:\t {}%'.format(len(normal_index)/test_weather_ts.shape[0]*100)
    ld_rate = 'Lack of data rate:\t {}%'.format(len(lack_index)/test_weather_ts.shape[0]*100)
    
    print(fd_rate)
    print(nd_rate)
    print(ld_rate)
    
    # Save results:
    dir_loc = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}'.format(rank_method_name,experimentName)
    # create the directory if it does not exist
    if not os.path.exists(dir_loc):
        os.makedirs(dir_loc)
    with open(dir_loc+'\\results.txt','w') as f:
        f.write(fd_rate + '\n' + nd_rate + '\n' + ld_rate)
Example #4
     predictions.append('len<')
     continue
 
 # # Training MVs data
 # MVs_ts[MV_index,combination]
 # # Test MVs data
 # test_MVs_ts[MV_index,n]
 
 
 # reshape to row array to concatenate
 test_data_point = test_MVs_ts[MV_index,n].reshape((1,MVs_ts[MV_index,weather_group].shape[1]))
 # concatenated matrix of training data and the test data sample
 NT_data = np.concatenate((MVs_ts[MV_index,weather_group],test_data_point),axis = 0)
 
 LOF = LocalOutlierFactor(n_neighbors = 3,metric='precomputed')
 D = gen_dist_mat(NT_data) # distance matrix
 
 # if the distance matrix is all zeros (all TS are identical), skip this sample
 if len(D[D == 0]) == D.shape[0]*D.shape[1]:
     predictions.append('D=0')
     continue
     
 pred = LOF.fit_predict(D)
 # store as a string so later numpy comparisons against the 'D=0' and 'len<' labels work
 predictions.append(str(pred[-1]))
 
 # if detected as outlier, save plot of MVs
 if pred[-1] == -1:
     plt.figure()
     # # draw only the current MV-----
     for c in weather_group:
         plt.plot(MVs_ts[MV_index,c],color='steelblue',alpha=0.5,linestyle='dotted')
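
gen_dist_mat is imported from the author's myFunctions module and is not shown in these examples. Below is a minimal sketch of what it presumably computes, reconstructed from its call sites: an N-by-N symmetric matrix of pairwise distances between the rows of the input, with an optional distance function (Example #1 and Example #5 pass scipy's minkowski). The name gen_dist_mat_sketch and the Euclidean default are assumptions, not the original implementation.

import numpy as np
from scipy.spatial import distance

def gen_dist_mat_sketch(data, dist=distance.euclidean):
    # assumed behaviour: pairwise distances between the rows of `data`
    data = np.asarray(data)
    N = data.shape[0]
    D = np.zeros((N, N))
    for i in range(N):
        for j in range(i + 1, N):
            d = dist(data[i], data[j])
            D[i, j] = d
            D[j, i] = d  # symmetric, zero diagonal
    return D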
Example #5
plt.ylabel('Heat load[W]')
plt.legend()
plt.show()

# indices of the samples in cluster 1 (weekends + holidays when N = 2)
np.arange(kmeans.labels_.shape[0])[kmeans.labels_ == 1]

## Compare with DBSCAN
# Get the current script file's directory:
loc = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
# Set working directory
os.chdir(loc)
from myFunctions import gen_dist_mat, k_dist
from scipy.spatial import distance
dist = distance.minkowski
D = gen_dist_mat(heatload, dist)

# '''
# inputs:
#     D: distance matrix(N by N)
#     k: k-th neighbor distance
# '''
# def k_dist(D,k = 4):
#     import numpy as np
#     D = np.array(D)
#     N = D.shape[0]
#     # initialize k_dist vector
#     k_dist = np.zeros((N,1))
#     for i in range(N):
#         row = list(D[i,:])
#         for j in range(k): # remove min(row) k times, not k-1 times, because closest is always itself!
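
The commented-out k_dist above is truncated. Below is a minimal sketch of the k-distance computation it describes (for each point, the distance to its k-th nearest neighbor, excluding the point itself, typically sorted to read off a knee for DBSCAN's eps); it is assumed to mirror the imported myFunctions.k_dist, and the name k_dist_sketch is hypothetical.

import numpy as np

def k_dist_sketch(D, k=4):
    # D is an N-by-N distance matrix; after sorting a row, position 0 is the
    # point itself (distance 0), so position k is the k-th nearest neighbor.
    D = np.array(D)
    N = D.shape[0]
    kth = np.zeros(N)
    for i in range(N):
        row = np.sort(D[i, :])
        kth[i] = row[k]
    return np.sort(kth)[::-1]  # descending, for the k-distance (elbow) plot

# k_distances = k_dist_sketch(D, k=4)
# plt.plot(k_distances); plt.ylabel('4-dist'); plt.show()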