Beispiel #1
0
def main():
    """Run the LSTM auto-encoder anomaly demo on the ECG discord CSV.

    Reads ``./data/ecg_discord_test.csv`` (no header row), min-max scales it,
    and, when the module-level ``DO_TRAINING`` flag is truthy, fits the model
    on the first 23 rows and stores it under ``./models``.  The stored model
    is then loaded back, the same 23 rows are scored, one verdict line is
    printed per item yielded by ``ae.anomaly`` and the collected distances are
    plotted against ``ae.threshold``.
    """
    data_dir_path = './data'
    model_dir_path = './models'
    ecg_data = pd.read_csv(data_dir_path + '/ecg_discord_test.csv',
                           header=None)
    print(ecg_data.head())
    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same ndarray and works on both old and new pandas releases.
    ecg_np_data = ecg_data.values
    scaler = MinMaxScaler()
    ecg_np_data = scaler.fit_transform(ecg_np_data)
    print(ecg_np_data.shape)

    # The same leading slice is used for training and for scoring.
    sample_rows = ecg_np_data[:23, :]

    ae = LstmAutoEncoder()

    # fit the data and save model into model_dir_path
    if DO_TRAINING:
        ae.fit(sample_rows,
               model_dir_path=model_dir_path,
               estimated_negative_sample_ratio=0.9)

    # load back the model saved in model_dir_path and detect anomalies
    ae.load_model(model_dir_path)
    anomaly_information = ae.anomaly(sample_rows)
    reconstruction_error = []
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        verdict = 'abnormal' if is_anomaly else 'normal'
        print('# ' + str(idx) + ' is ' + verdict + ' (dist: ' + str(dist) +
              ')')
        reconstruction_error.append(dist)

    visualize_reconstruction_error(reconstruction_error, ae.threshold)
def main():
    """Fit the LSTM auto-encoder on ``ground_anomaly.csv`` and list anomalies.

    Reads ``./data/ground_anomaly.csv``, discards the first data row and five
    named columns, min-max scales the remainder, fits the model on the first
    10000 rows (saving it under ``./models``), reloads it and scores the same
    rows.  The index of every item flagged as anomalous is printed, followed
    by the total count and the full index list, and the distances are plotted
    against ``ae.threshold``.
    """
    data_dir_path = './data'
    model_dir_path = './models'
    ecg_data = pd.read_csv(data_dir_path + '/ground_anomaly.csv')
    ecg_data = ecg_data[1:]
    ecg_data = ecg_data.drop(
        ['TIMESTAMP', 'RECORD', 'AmbTemp_C_Avg', 'InvPAC_kW_Avg',
         'PwrMtrP_kW_Avg'],
        axis=1)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same ndarray and works on both old and new pandas releases.
    ecg_np_data = ecg_data.values
    scaler = MinMaxScaler()
    ecg_np_data = scaler.fit_transform(ecg_np_data)

    ae = LstmAutoEncoder()
    print(ecg_data.shape)
    # shape[0] is the number of rows left after the slice above.
    row_count = ecg_data.shape[0]
    print(row_count)

    # The same leading slice is used for training and for scoring.
    sample_rows = ecg_np_data[:10000, :]

    # fit the data and save model into model_dir_path
    ae.fit(sample_rows,
           model_dir_path=model_dir_path,
           estimated_negative_sample_ratio=0.95)

    # load back the model saved in model_dir_path and detect anomalies
    ae.load_model(model_dir_path)
    anomaly_information = ae.anomaly(sample_rows)
    reconstruction_error = []
    idx_list = []
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        if is_anomaly:
            print(idx)
            idx_list.append(idx)
            print('# ' + str(idx) + ' is abnormal.')
        reconstruction_error.append(dist)
    # One index is recorded per anomaly, so the list length is the count.
    print(len(idx_list))
    print(idx_list)
    visualize_reconstruction_error(reconstruction_error, ae.threshold)
Beispiel #3
0
def main_test(db_file_name, sub_output_dir, COLUM_LIST, ERROR_LIST):
    """Evaluate every auto-encoder variant on every generated dataset.

    Seeds NumPy with the module-level ``RANDOM_SEED``, makes sure
    ``sub_output_dir`` exists, records the run information, builds the
    datasets via ``data.create_data`` and then, for each dataset, min-max
    scales it and hands it to ``AutoEncoder_test`` once per model class.

    Args:
        db_file_name: Forwarded unchanged to ``data.create_data``.
        sub_output_dir: Output directory; created when it does not exist.
        COLUM_LIST: Forwarded to ``info.save_infomation`` and
            ``data.create_data``.
        ERROR_LIST: Forwarded to ``info.save_infomation``,
            ``data.create_data`` and ``AutoEncoder_test``.
    """
    np.random.seed(RANDOM_SEED)

    if os.path.exists(sub_output_dir):
        print(sub_output_dir + 'Already exist')
    else:
        create_directory(sub_output_dir)

    info.save_infomation(sub_output_dir, date_str, COLUM_LIST, ERROR_LIST)
    X_data, Y_data = data.create_data(db_file_name,
                                      sub_output_dir,
                                      COLUM_LIST,
                                      ERROR_LIST,
                                      is_di=True)

    # (reported model name, class) pairs, evaluated in this order.
    model_table = (
        ('LstmAutoEncoder', LstmAutoEncoder),
        ('CnnLstmAutoEncoder', CnnLstmAutoEncoder),
        ('Conv1DAutoEncoder', Conv1DAutoEncoder),
        ('BidirectionalLstmAutoEncoder', BidirectionalLstmAutoEncoder),
        ('FeedForwardAutoEncoder', FeedForwardAutoEncoder),
    )

    for i in range(len(X_data)):
        # Use the i-th dataset; indexing with a constant 0 here would
        # re-evaluate the first dataset on every iteration.
        x_ = X_data[i]
        y_ = Y_data[i]
        # A fresh scaler per dataset, fitted on that dataset only.
        X_scaler_data = MinMaxScaler().fit_transform(x_)

        for model_name, model_cls in model_table:
            ae = model_cls()
            AutoEncoder_test(X_scaler_data, y_, sub_output_dir, i,
                             model_name, ae, ERROR_LIST)
    info.save_information_done(sub_output_dir)
def main():
    """Fit and evaluate the LSTM auto-encoder on the ADFA-LD tf-idf CSVs.

    Loads ``train_normal.csv`` and ``test_attack.csv`` (skipping the first
    line of each and dropping the last column), stacks them, min-max scales
    the stacked matrix, fits the model on rows 0..831 of it and stores the
    model under ``model_dir_path``.  The stored model is then loaded back and
    every row is scored with ``threshold=1.75``; one verdict line is printed
    per item and the distances are plotted against ``ae.threshold``.
    """
    data_dir_path = '/Users/Shariful/Documents/DataCamp/ADFA-LD(tf-idf)'
    model_dir_path = '/Users/Shariful/Documents/GitHubRepo/Datasets/adfa_demo/models'

    train_normal = pd.read_csv(data_dir_path + '/train_normal.csv',
                               skiprows=1,
                               index_col=None,
                               header=None)
    test_attack = pd.read_csv(data_dir_path + '/test_attack.csv',
                              skiprows=1,
                              index_col=None,
                              header=None)
    # Drop the last column of each frame before stacking.
    train_normal = train_normal.iloc[:, 0:-1]
    test_attack = test_attack.iloc[:, 0:-1]

    ecg_data = pd.concat([train_normal, test_attack], ignore_index=True)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same ndarray and works on both old and new pandas releases.
    ecg_np_data = ecg_data.values
    scaler = MinMaxScaler()
    ecg_np_data = scaler.fit_transform(ecg_np_data)
    print(ecg_np_data.shape)

    ae = LstmAutoEncoder()

    # fit the data and save model into model_dir_path
    ae.fit(ecg_np_data[0:832, :],
           model_dir_path=model_dir_path,
           estimated_negative_sample_ratio=0.9)

    # load back the model saved in model_dir_path and detect anomalies
    ae.load_model(model_dir_path)
    anomaly_information = ae.anomaly(ecg_np_data, threshold=1.75)
    reconstruction_error = []
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        verdict = 'abnormal' if is_anomaly else 'normal'
        print('# ' + str(idx) + ' is ' + verdict + ' (dist: ' + str(dist) +
              ')')
        reconstruction_error.append(dist)

    visualize_reconstruction_error(reconstruction_error, ae.threshold)
Beispiel #5
0
def lstmnn(inputfile, weekday, lanedirection, hourfrom, hourto):
    """Detect traffic-volume outliers with an LSTM auto-encoder and render them.

    Reads ``./datalake/<inputfile>``, keeps the rows matching the given week
    category, lane direction and inclusive hour range, fits the auto-encoder
    on the min-max scaled ``Volume`` column, scores the same data and compares
    the predictions with the ``Outlier`` column.

    Args:
        inputfile: Name of the CSV file inside ``./datalake``.
        weekday: Value the ``Week`` column must equal.
        lanedirection: Value the ``LaneDirection`` column must equal.
        hourfrom: Lower bound (inclusive) for the ``Hour`` column.
        hourto: Upper bound (inclusive) for the ``Hour`` column.

    Returns:
        The result of ``render_template("home.html", ...)`` with the embedded
        PNG plot (``graph``), the balanced-accuracy text (``data``), the HTML
        table of rows predicted abnormal (``data3``) and the elapsed seconds
        measured up to the metric computation (``data2``).
    """
    begin = time.perf_counter()
    data_dir_path = './datalake'
    model_dir_path = './models'
    # NOTE(review): inputfile is joined into the path unchecked; confirm that
    # callers cannot pass path separators.
    df = pd.read_csv(data_dir_path + '/' + inputfile)

    selected = ((df['Week'] == weekday)
                & (df['LaneDirection'] == lanedirection)
                & (df['Hour'] >= hourfrom) & (df['Hour'] <= hourto))
    # Take an explicit copy so adding columns below never writes into (or
    # warns about) a slice of df.
    dat = df.loc[selected, [
        'Sdate', 'DayName', 'LaneNumber', 'DirectionDescription', 'Volume',
        'AvgSpeed', 'Outlier'
    ]].copy()
    # Positional row number within the filtered frame, as the first column.
    dat.insert(0, 'Row', range(len(dat)))
    print(dat)

    traffic_data = dat[['Volume']]
    print(traffic_data.head())
    traffic_np_data = traffic_data.values
    scaler = MinMaxScaler()
    traffic_np_data = scaler.fit_transform(traffic_np_data)
    print(traffic_np_data.shape)

    ae = LstmAutoEncoder()

    # fit the data and save model into model_dir_path
    ae.fit(traffic_np_data[:, :],
           model_dir_path=model_dir_path,
           estimated_negative_sample_ratio=0.9)

    # load back the model saved in model_dir_path and detect anomalies
    ae.load_model(model_dir_path)
    anomaly_information = ae.anomaly(traffic_np_data)
    reconstruction_error = []
    # 1 = predicted abnormal, 0 = predicted normal; collected in a list
    # instead of growing a DataFrame one row at a time.
    predictions = []

    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        verdict = 'abnormal' if is_anomaly else 'normal'
        print('# ' + str(idx) + ' is ' + verdict + ' (dist: ' + str(dist) +
              ')')
        predictions.append(1 if is_anomaly else 0)
        reconstruction_error.append(dist)

    # Raises ValueError when the model yields a different number of items
    # than there are filtered rows.
    dat['OutlierPrediction'] = predictions
    df3 = dat.loc[dat['OutlierPrediction'] == 1]
    print(df3)

    # labels=[0, 1] keeps the matrix 2x2 even when only one class occurs, so
    # the four-way unpacking below cannot fail on single-class data.
    # NOTE(review): assumes the 'Outlier' column is coded 0/1 - confirm.
    tn, fp, fn, tp = confusion_matrix(dat['Outlier'].values,
                                      dat['OutlierPrediction'].values,
                                      labels=[0, 1]).ravel()
    sensitivity = tp / (tp + fn)
    specificity = tn / (fp + tn)
    BalancedAccuracy = (sensitivity + specificity) / 2
    end = time.perf_counter() - begin
    print('Balanced Accuracy=%.2f' % (BalancedAccuracy))

    # Plot on a dedicated figure and always close it, so repeated calls do
    # not draw over (or leak) figures from earlier calls.
    img = io.BytesIO()
    fig = pyplot.figure()
    try:
        pyplot.plot(reconstruction_error,
                    marker='o',
                    ms=3.5,
                    linestyle='',
                    label='Point')
        pyplot.hlines(ae.threshold,
                      xmin=0,
                      xmax=len(reconstruction_error) - 1,
                      colors="r",
                      zorder=100,
                      label='Threshold')
        pyplot.legend()
        pyplot.ylabel("Dist")
        pyplot.xlabel("Data point index")
        pyplot.savefig(img, format='png')
    finally:
        pyplot.close(fig)
    img.seek(0)

    # Embed the PNG directly in the page as a base64 data URI.
    plot_url = base64.b64encode(img.getvalue()).decode()
    rsp = '<img src="data:image/png;base64,{}">'.format(plot_url)

    return render_template("home.html",
                           graph=rsp,
                           data="Balanced Accuracy = " +
                           str(round(BalancedAccuracy, 2)),
                           data3=df3.to_html(),
                           data2=round(end, 2))
Beispiel #6
0
# Module-level fragment: sets two directory paths, constructs an
# LstmAutoEncoder, loads a previously saved model from model_dir_path and
# defines the path of the test index CSV.  The commented-out lines are
# disabled training / test-loading code kept from the original snippet.
# NOTE(review): this looks like a truncated, de-indented copy of a main()
# body - confirm whether it is still needed.
#    normal test path
    
data_dir_path = '/Users/Shariful/Documents/DataCamp/ADFA-LD(tf-idf)'
model_dir_path = '/Users/Shariful/Documents/DataCamp/models/'

# Disabled: load the training CSV, drop its first column and min-max scale it.
##    ecg_data = pd.read_csv(data_dir_path + '/train_normal.csv', header=None)
#    ecg_data = pd.read_csv(train_path, header = None, skiprows = 1)
#    ecg_data = ecg_data.drop(ecg_data.columns[0], axis = 1)  
#    print(ecg_data.head())
#    
#    ecg_np_data = ecg_data.as_matrix()
#    scaler = MinMaxScaler()
#    ecg_np_data = scaler.fit_transform(ecg_np_data)
#    print(ecg_np_data.shape)
#
ae = LstmAutoEncoder()
##
# Disabled: training step (the model is only loaded below, not fitted).
#    # fit the data and save model into model_dir_path
#    ae.fit(ecg_np_data, model_dir_path=model_dir_path, estimated_negative_sample_ratio=0.9)
##    ae.fit(ecg_np_data, model_dir_path=model_dir_path, estimated_negative_sample_ratio=0.9)

    # load back the model saved in model_dir_path to detect anomalies
ae.load_model(model_dir_path)
# Disabled: build a labelled test set from the attack and normal CSVs.
#    load test set
#    test_atk = pd.read_csv(data_dir_path + '/test_attack.csv', header=None)
#    test_nml = pd.read_csv(data_dir_path + '/test_normal.csv', header=None)
#    df_test = pd.concat([test_atk, test_nml])
#    lab_test = pd.concat([pd.DataFrame([1] * len(test_atk)), \
#                          pd.DataFrame([0] * len(test_nml))], ignore_index=True)
    
test_idx_path = '/Users/Shariful/Documents/GitHubRepo/Datasets/ADFA-LD/n-gram/5-gram/5_gram_test_idx.csv'
Beispiel #7
0
def main():
    """Label ADFA-LD 5-gram test sequences with a previously trained model.

    Loads the stored LSTM auto-encoder from ``model_dir_path`` (no training
    happens here), reads the 5-gram test matrix and its index-range table,
    min-max scales the rows up to index label 127949 and scores them.  For
    each of the first 121 index ranges it prints ``abnormal`` when any scored
    item inside the inclusive range is flagged, otherwise ``normal``.
    """
    model_dir_path = '/Users/Shariful/Documents/DataCamp/models/'
    # Hard-coded limits of this experiment.
    last_row_label = 127949
    sequence_count = 121

    ae = LstmAutoEncoder()

    # load back the model saved in model_dir_path to detect anomalies
    ae.load_model(model_dir_path)

    test_idx_path = '/Users/Shariful/Documents/GitHubRepo/Datasets/ADFA-LD/n-gram/5-gram/5_gram_test_idx.csv'
    df_test_idx = pd.read_csv(test_idx_path, header=None, skiprows=1)
    test_path = '/Users/Shariful/Documents/GitHubRepo/Datasets/ADFA-LD/n-gram/5-gram/5_gram_test.csv'
    df_test = pd.read_csv(test_path, header=None, skiprows=1)
    # .loc slicing is label based and includes the end label.
    df_test = df_test.loc[:last_row_label, :]

    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same ndarray and works on both old and new pandas releases.
    df_test = df_test.values
    # NOTE(review): the scaler is fitted on the test data itself, not on the
    # data the stored model was trained with - confirm this is intended.
    scaler = MinMaxScaler()
    df_test = scaler.fit_transform(df_test)

    # Materialise the results so they can be indexed by position below.
    anomaly_information = list(ae.anomaly(df_test))

    for idx_seq in range(0, sequence_count):
        # Columns 0 and 1 of the index table hold the first and last
        # (inclusive) position of the sequence.
        start_idx = df_test_idx.loc[idx_seq, 0]
        end_idx = df_test_idx.loc[idx_seq, 1]

        # A sequence is abnormal as soon as one of its items is flagged.
        flagged = any(anomaly_information[idx][0]
                      for idx in range(start_idx, end_idx + 1))
        print('abnormal' if flagged else 'normal')
def main():
    """Train, score and evaluate the LSTM auto-encoder on the Canali dataset.

    Fits the model on ``train_set.csv`` (unscaled, ``batch_size=1000``,
    ``epochs=20``), stores it under ``model_dir_path`` and loads it back.
    ``test_set.csv`` is then scored with ``threshold=150`` and, for every
    index range listed in ``test_set_index_range_label.csv``, the largest
    distance inside the range is kept.  Those per-range scores are plotted
    against ``ae.threshold``, passed to ``plot_ROC`` together with the labels
    from the last column of the index table, and written to
    ``lstm_128_units.csv`` under ``score_dir_path``.
    """
    #================read training dataset====================
    data_dir_path = (r'/Users/Shariful/Documents/SysCallDataset/PreparedData'
                     r'/Canali_dataset/sliding_window_5')
    model_dir_path = (r'/Users/Shariful/Documents/GitHubRepo/deeplearning/'
                      r'syscall_anomaly/Canali/trained_models')
    score_dir_path = (r'/Users/Shariful/Documents/GitHubRepo/deeplearning/'
                      r'syscall_anomaly/Canali/scores')

    canali_data = pd.read_csv(data_dir_path + '/train_set.csv', header=None)

    #==================Fit the LSTM model=====================
    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same ndarray and works on both old and new pandas releases.
    canali_np_data = canali_data.values

    ae = LstmAutoEncoder()

    # fit the data and save model into model_dir_path
    ae.fit(canali_np_data,
           model_dir_path=model_dir_path,
           batch_size=1000,
           epochs=20,
           estimated_negative_sample_ratio=None)

    #==========Load the saved model===========
    ae.load_model(model_dir_path)

    #=============read test dataset===============
    test_idx_path = data_dir_path + '/test_set_index_range_label.csv'
    df_test_idx = pd.read_csv(test_idx_path, header=None)

    test_path = data_dir_path + '/test_set.csv'
    df_test = pd.read_csv(test_path, header=None)

    df_test_np = df_test.values

    # The last column of the index table is used as the label per range.
    test_labels = np.array(df_test_idx.iloc[:, -1])

    #================predict scores on testing set============
    anomaly_information = ae.anomaly(df_test_np, threshold=150)

    # Column 1 of the index table is the last position of each range; read
    # it once instead of materialising a row for every scored item.
    range_ends = df_test_idx[1].values
    idx_out = 0
    max_scores = np.zeros((df_test_idx.shape[0]))
    for idx_in, (_, dist) in enumerate(anomaly_information):
        # keep the maximum score out of all subsequences' scores per range
        if idx_in <= range_ends[idx_out]:
            if max_scores[idx_out] < dist:
                max_scores[idx_out] = dist
        else:
            # Past the end of the current range: move on to the next one and
            # start it with this item's score.
            idx_out += 1
            max_scores[idx_out] = dist

    visualize_reconstruction_error(max_scores, ae.threshold)

    #=============plot and save the computed scores==============
    #    draw the roc curve
    plot_ROC(test_labels, max_scores)

    #    save the computed scores
    np.savetxt(score_dir_path + '/lstm_128_units.csv',
               max_scores,
               delimiter=",")