Example #1
def getPCA(train_data, test_data):
    originalData = train_data
    #Extract the observations by dropping the label column
    xData = originalData.drop(' Label', axis=1)
    #Define a StandardScaler
    scaling = StandardScaler()
    #Standardise the data
    X_std = scaling.fit_transform(xData)

    #Construct PCA data
    pca_std = PCA().fit(X_std)

    #Transform to PCA components
    pcaData = pca_std.transform(X_std)

    #Create the column for pcaData
    pcaCol = []
    for i in range(pcaData.shape[1]):
        col = 'Component' + str(i + 1)
        pcaCol += [col]

    #Convert the numpy array to a data frame
    pcaDf = pd.DataFrame(data=pcaData, columns=pcaCol)

    #Add Label column to pcaDf
    pcaDf[' Label'] = originalData[' Label'].values
    #Save the result in a pickle file
    util.pklSaver(pcaDf,
                  'PCA_data',
                  path=util.getResourcePath() +
                  '/Pickle Files/Data for model construction/First layer/')

    #Convert test Data to PCA
    testDataSet = test_data
    testData = testDataSet.drop(' Label', axis=1)

    #Use the scaler fitted on the training data to transform the test data
    testDataStd = scaling.transform(testData)
    test_pcaData = pca_std.transform(testDataStd)

    #Convert the numpy array to a data frame
    test_pcaDf = pd.DataFrame(data=test_pcaData, columns=pcaCol)

    #Add the Label column to test_pcaDf
    test_pcaDf[' Label'] = testDataSet[' Label'].values
    #Save the result in a pickle file
    util.pklSaver(test_pcaDf,
                  'PCA_testData',
                  path=util.getResourcePath() +
                  '/Pickle Files/Data for model construction/First layer/')
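
#A minimal usage sketch (illustrative, not part of the original example): build tiny
#synthetic train/test frames with a ' Label' column and pass them to getPCA. It assumes
#the project's util helpers (pklSaver, getResourcePath) are importable and that pandas,
#StandardScaler and PCA are imported as in the original module.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
demo_train = pd.DataFrame(rng.normal(size=(100, 4)),
                          columns=['f1', 'f2', 'f3', 'f4'])
demo_train[' Label'] = rng.integers(0, 2, size=100)
demo_test = demo_train.sample(n=20, random_state=0)
getPCA(demo_train, demo_test)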
Example #2
def getRanking(train_data, filename):
    # Examine feature importance after excluding three common features
    train_data = train_data.drop(
        [' Source Port', ' Destination Port', ' Protocol'], axis=1)

    #Separate observations and labels
    xData = train_data.drop([' Label'], axis=1)
    y = train_data[' Label'].values

    #Standardise the value of observations
    X_std = StandardScaler().fit_transform(xData)
    # feature extraction
    filterModel = SelectKBest(score_func=f_classif)
    model = filterModel.fit(X_std, y)

    # Construct an Extra Trees model
    extraTree = classifier.extraTrees(train_data)
    # Construct a gradient boosting (LightGBM) model
    gradientModel = classifier.lightGBM_model(train_data, customEval=True)

    #anova_imp= MinMaxScaler().fit_transform(np.array([model.scores_]).T).T
    # et_imp = MinMaxScaler().fit_transform(np.array([extraTree2.feature_importances_]).T).T
    # gd_imp = MinMaxScaler().fit_transform(np.array([gradientModel2.feature_importance()]).T).T

    # Save the results of the three algorithms to a data frame
    order = pd.DataFrame(
        {
            'ANOVA F-value': model.scores_,
            'Extra Tree': extraTree.feature_importances_,
            'Gradient Boosting': gradientModel.feature_importance()
        },
        index=xData.columns)
    order = order.fillna(0)

    #Rescale the value of each algorithm in the range [0, 1]
    orderScaled = MinMaxScaler().fit_transform(order)
    orderScaled = pd.DataFrame(orderScaled,
                               index=order.index,
                               columns=order.columns)

    #Add a column holding the sum of the three algorithms' values
    orderScaled['Total'] = orderScaled.sum(axis=1)
    orderScaled = orderScaled.round(3)

    #Save the result to a pickle file and display it sorted by the total score (highest first)
    util.pklSaver(orderScaled,
                  filename,
                  path=util.getResourcePath() +
                  '/Pickle Files/Feature Importance/')
    print(orderScaled.sort_values('Total', ascending=False))
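
#A small follow-up sketch (assumed, not shown in the original example): read the saved
#ranking back and return the names of the top-k features by total score. 'FeatureRanking'
#is a placeholder for whatever filename was passed to getRanking; k defaults to 35 only
#to mirror the impList[:35] slice used in a later example.
def topRankedFeatures(filename='FeatureRanking', k=35):
    ranking = util.pklReader(filename,
                             path=util.getResourcePath() +
                             '/Pickle Files/Feature Importance/')
    #'Total' holds the summed, rescaled scores of the three algorithms
    return list(ranking.sort_values('Total', ascending=False).index[:k])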
def getAllData(folderPath=util.getResourcePath() +
               '/Pickle Files/Original Data/Benigns/'):
    #Get the list of files in the folder
    files = []
    for (dirPath, dirNames, fileNames) in walk(folderPath):
        files.extend(fileNames)

    #Initialise allData with the first file
    allData = util.pklReader(files[0], path=folderPath)

    #Append the remaining files to allData
    for i in range(1, len(files)):
        pklData = util.pklReader(files[i], path=folderPath)
        allData = pd.concat([allData, pklData])
    print('Total length of allData: ', len(allData))

    return allData
def secondLayerWeights():
    #Loading the test set
    allAttack20 = util.pklReader(
        'AllAttack20',
        path=util.getResourcePath() +
        '/Pickle Files/Data for model construction/Second layer/')

    # Convert the string labels to numeric values
    count = 1
    for att in allAttack20:
        allAttack20[att][' Label'] = count
        count = count + 1

    #A dictionary to store the number of misclassifications for each value of the LightGBM weight
    allErr = {}
    start = time.time()

    #Count the misclassifications over all attacks for each candidate weight
    for i in range(50):
        #Randomly select a weight for LightGBM
        a = np.random.uniform(low=0.7, high=1)
        totalErr = 0
        for attack in allAttack20:
            X_test = allAttack20[attack].drop(
                [' Source Port', ' Destination Port', ' Protocol', ' Label'],
                axis=1)
            y_test = allAttack20[attack][' Label'].values
            y_predict = getPrediction(X_test, a)

            error = (y_predict != y_test).sum()
            totalErr = totalErr + error

        allErr[a] = totalErr
        print('The weight of lightGBM: ', a, ', Number of errors: ', totalErr)

    #Calculate the time
    end = time.time()
    runningTime = (end - start) / 60

    #The chosen LightGBM weight is the one that minimises the number of misclassifications
    lightGBM_weight = min(allErr, key=lambda key: allErr[key])

    print('Running time: ', runningTime)
    print('Best weight (fewest errors): ', lightGBM_weight, allErr[lightGBM_weight])

    return lightGBM_weight
def dataProcessor(day, inputData):
    #Perform some analysis
    print('Number of columns: ', len(inputData.columns))

    #Display the distinct values in the Label column
    labelSet = set(inputData[' Label'])
    print('The distinct values in the Label column: ', labelSet)

    #Original shape
    origShape = inputData.shape
    print('Shape of original data: ', origShape)

    #Remove unnecessary columns
    rmvCol = ['Flow ID', ' Source IP', ' Destination IP', ' Timestamp']
    inputData = inputData.drop(rmvCol, axis=1)

    #Check for NaN and Infinity values in two columns
    objCol = ['Flow Bytes/s', ' Flow Packets/s']

    #Convert 'Flow Bytes/s' and 'Flow Packets/s' to float (NaN and Infinity values become nan and inf respectively)
    inputData[objCol] = inputData[objCol].astype(float)

    #Replace inf by nan
    inputData[objCol] = inputData[objCol].replace(np.inf, np.nan)
    print('Total number of nan: ', inputData[objCol].isna().sum())
    print('before: ', inputData.shape)
    #Drop rows containing nan
    inputData = inputData.dropna()
    print('The final shape of the data: ', inputData.shape)

    #Check duplicate data after removing column and nan, inf
    origShape = inputData.shape
    inputData = inputData.drop_duplicates()
    rmvShape = inputData.shape
    print('Shape of the dataset after removing duplicates: ', rmvShape)
    print('Number of duplicate rows: ', origShape[0] - rmvShape[0])

    #Store the data for each label in a pickle file
    labelList = list(labelSet)
    count = 0
    for label in labelList:
        dataLabel = inputData[inputData[' Label'] == label]
        print('Number of ' + label + ' :', dataLabel.shape)
        count += dataLabel.shape[0]
        #pklSaver(dataLabel,label)

        if (label == 'BENIGN'):
            util.pklSaver(dataLabel,
                          day + '-Benign',
                          path=util.getResourcePath() +
                          '/Pickle Files/Original Data/Benigns/')

        else:
            util.pklSaver(dataLabel,
                          label,
                          path=util.getResourcePath() +
                          '/Pickle Files/Original Data/Attacks/')

    print('Total: ', count)

    return {'label': labelSet, 'data': inputData}


#Perform data preprocessing
dataProcessor('Tues', tues)

#Get all Benign
allBenign = getAllData()
util.pklSaver(allBenign,
              'All Benign',
              path=util.getResourcePath() + '/Pickle Files/Original Data/')

#Get all attack
allAttacks = getAllData(folderPath=util.getResourcePath() +
                        '/Pickle Files/Original Data/Attacks/')
util.pklSaver(allAttacks,
              'All Attacks',
              path=util.getResourcePath() + '/Pickle Files/Original Data/')

# Convert the Label of allAttack to 1 and allBenign to 0
allAttacks[' Label'] = 1
allBenign[' Label'] = 0
allData = pd.concat([allAttacks, allBenign])

#Display the length of all data and save it to 'All Data' pkl file
print('Length of all data: ', len(allData))
Example #7
    totalErr = 0
    totalLen = 0
    #Evaluate the accuracy of each attack
    for attack in testSet:
        #Separate observations and labels
        X_test = testSet[attack].drop(
            [' Source Port', ' Destination Port', ' Protocol', ' Label'], axis=1)
        y_test = testSet[attack][' Label'].values

        #Get the prediction for the observations
        y_predict = secondLayer.getClassification(X_test)

        #Calculate the number of misclassifications
        error = (y_predict != y_test).sum()
        totalErr += error
        totalLen += len(testSet[attack])
        #Display the attack name, the misclassification count out of the total, and the accuracy
        print(attack, ', misclassification rate: ', error, '/', len(testSet[attack]),
              ', Accuracy: ', 100 - error * 100 / len(y_predict))

    print('Total number of misclassifications: ', totalErr)
    print('Overall accuracy: ', 1 - totalErr / totalLen)

#Display the evaluation result
#Loading the test set for the first layer
test_set = util.pklReader(
    'Testset',
    path=util.getResourcePath() +
    '/Pickle Files/Data for model construction/First Layer/')
print('Evaluating the performance of the first layer ...')
firstLayerEval(test_set)

#Loading the test set for the second layer
allAttack20 = util.pklReader(
    'AllAttack20',
    path=util.getResourcePath() +
    '/Pickle Files/Data for model construction/Second layer/')
print('\nEvaluating the performance of the second layer ...')
secondLayerEval(allAttack20)
'''

import util
import firstLayer
import matplotlib.pyplot as plt
from sklearn import metrics
import numpy as np

#Loading the real traffic
real_traffic = util.dataConvRealTraff('HulkWithTime.pcap_ISCX')
#Label flows targeting port 8080 as the DoS attack
real_traffic.loc[real_traffic[' Destination Port'] == 8080, ' Label'] = 1

#Load the test set of the evaluation dataset
testDataSet = util.pklReader(
    'Testset',
    path=util.getResourcePath() +
    '/Pickle Files/Data for model construction/First Layer/')

def getRocInfo(data):
    #Exclude the label from the observations
    x_test = data.drop(' Label', axis=1)
    y_test = data[' Label'].values

    #Get the prediction for the data
    pred = firstLayer.getPred(x_test)

    #Compute fpr, tpr and the AUC
    fpr, tpr, threshold = metrics.roc_curve(y_test, pred)
    roc_auc = metrics.auc(fpr, tpr)

    return {'pred': pred, 'fpr': fpr, 'tpr': tpr, 'auc': roc_auc}
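
#A minimal plotting sketch (assumed, not part of the original snippet): draw ROC curves
#from the dictionaries returned by getRocInfo for the real traffic and the test set,
#assuming both frames expose the feature columns expected by firstLayer.getPred.
for name, dataset in [('Real traffic', real_traffic), ('Test set', testDataSet)]:
    roc = getRocInfo(dataset)
    plt.plot(roc['fpr'], roc['tpr'], label=name + ' (AUC = %.3f)' % roc['auc'])
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.show()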
    #If the dataset is created from PCA
    else:
        col = list(train_data.columns)
        featureList = col[:n] + [' Label']

        data = train_data[featureList]
        testSet = test_data[featureList]

    return {'train': data, 'test': testSet}


## Perform hyperparameter tuning for the first layer
# Loading train and test set
originalData = util.pklReader(
    'Original',
    path=util.getResourcePath() +
    '/Pickle Files/Data for model construction/First layer/')
testDataSet = util.pklReader(
    'Testset',
    path=util.getResourcePath() +
    '/Pickle Files/Data for model construction/First layer/')
impList = util.pklReader(
    'ImportanceList',
    path=util.getResourcePath() +
    '/Pickle Files/Data for model construction/First layer/')
pca_trainData = util.pklReader(
    'PCA_data',
    path=util.getResourcePath() +
    '/Pickle Files/Data for model construction/First layer/')
pca_testData = util.pklReader(
    'PCA_testData',
    path=util.getResourcePath() +
    '/Pickle Files/Data for model construction/First layer/')

def hyperparaTuning(data,
                    testSet,
                    expName,
                    mode=2,
                    storedPath=util.getResourcePath() +
                    '/Pickle Files/Models/First Layer/'):
    # Construct the set of hyperparameters for each algorithm
    etTree_params = {
        "n_estimators": [150, 250, 350],
        "max_features": [None, 'sqrt', 'log2'],
        "min_samples_leaf": [64, 128, 256]
    }

    lightGBM_params = {
        "learning_rate": [0.06, 0.08, 0.1],
        "num_leaves": [15, 31, 63],
        "max_bin": [63, 127, 255],
        "feature_fraction": [0.6, 0.8, 0.9]
    }

    knn_params = {
        "n_neighbors": np.arange(5, 47, 2),
        "weights": ["uniform", "distance"],
        "metric": ["euclidean", "manhattan", "chebyshev"]
    }

    #Construct a model for each algorithm
    et_model = ExtraTreesClassifier()
    lgbm_model = lgbm.LGBMClassifier(objective='binary')
    knn_model = KNeighborsClassifier()

    #Construct the training and test data
    trainData = data.drop(' Label', axis=1)
    y_train = data[' Label'].values

    testData = testSet.drop(' Label', axis=1)
    y_test = testSet[' Label'].values

    #Perform the Extremely Randomized Trees algorithm
    if (mode == 1):
        modelName = 'ExtraTrees'
        params = etTree_params
        model = et_model

    #Perform LightGBM
    elif (mode == 2):
        modelName = 'LightGBM'
        params = lightGBM_params
        model = lgbm_model

    #Perform the KNN algorithm
    else:
        modelName = 'KNN'
        params = knn_params
        model = knn_model

        #Standardise the data in the case of KNN
        scaling = StandardScaler()
        trainData = scaling.fit_transform(trainData)
        testData = scaling.transform(testData)

    # tune the hyperparameters via a cross-validated Randomized search
    grid = RandomizedSearchCV(model, params, verbose=1, cv=5, n_jobs=1)
    start = time.time()
    grid.fit(trainData, y_train)

    #Calculate the time
    end = time.time()
    runningTime = (end - start) / 60

    # evaluate the best grid searched model on the testing data
    preds = grid.predict_proba(testData)
    auc = roc_auc_score(y_test, preds[:, 1])

    print("Experiment: ", expName)
    print("Randomized search best parameters: {}".format(grid.best_params_))
    print("AUC of the best model: ", auc)
    print("Running time: ", runningTime)

    #Save the model
    util.pklSaver(grid, expName, path=storedPath + modelName + '/')
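
#A usage sketch (assumed call pattern; the calls are not shown in the truncated example).
#The experiment names below are placeholders, and the data frames are the ones loaded above.
hyperparaTuning(originalData, testDataSet, 'Exp_ET_original', mode=1)
hyperparaTuning(originalData, testDataSet, 'Exp_LGBM_original', mode=2)
hyperparaTuning(originalData, testDataSet, 'Exp_KNN_original', mode=3)
hyperparaTuning(pca_trainData, pca_testData, 'Exp_LGBM_pca', mode=2)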
Example #11
def firstLayerWeights():
    #Loading test data and two subsets of test data
    subset1 = util.pklReader(
        'subset1',
        path=util.getResourcePath() +
        '/Pickle Files/Data for model construction/Subset/')
    subset2 = util.pklReader(
        'subset2',
        path=util.getResourcePath() +
        '/Pickle Files/Data for model construction/Subset/')

    #Loading the best model of each algorithm
    et_model = util.pklReader('Exp1',
                              path=util.getResourcePath() +
                              '/Pickle Files/Models/First Layer/ExtraTrees/')
    lgbm_model = util.pklReader('Exp2',
                                path=util.getResourcePath() +
                                '/Pickle Files/Models/First Layer/LightGBM/')

    #Use subset1 as training data to search for the weights
    impList = util.pklReader('ImportanceList',
                             path=util.getResourcePath() +
                             '/Pickle Files/Data for model construction/')
    colET = list(
        subset1.drop(
            [' Source Port', ' Destination Port', ' Protocol', ' Label'],
            axis=1).columns)
    x_subset1ET = subset1[colET]
    x_subset1LGBM = subset1[impList[:35]]
    y_subset1 = subset1[' Label'].values

    #Use subset2 as test data
    #x_subset2=subset2.drop(' Label',axis=1)
    x_subset2ET = subset2[colET]
    x_subset2LGBM = subset2[impList[:35]]
    y_subset2 = subset2[' Label'].values

    #A dictionary to store the AUC score for each value of the LightGBM weight
    auc_score = {}
    start = time.time()

    #Use only LightGBM and ExtraTrees
    for i in range(100):
        #Randomly select a weight for LightGBM
        a = np.random.uniform(low=0.5, high=1)
        b = 1 - a

        predict = a * lgbm_model.predict_proba(
            x_subset1LGBM) + b * et_model.predict_proba(x_subset1ET)
        auc = roc_auc_score(y_subset1, predict[:, 1])
        auc_score[a] = auc
        print('The weight of lightGBM: ', a, ', AUC score: ', auc)

    #Calculate the time
    end = time.time()
    runningTime = end - start

    #Get the LightGBM weight that maximises the AUC score
    lightGBM_weight = max(auc_score, key=lambda key: auc_score[key])

    #Display results
    print('Running time: ', runningTime)
    print('Max AUC: ', lightGBM_weight, auc_score[lightGBM_weight])

    #Compare the result with each individual algorithm
    print(
        'LightGBM',
        roc_auc_score(y_subset1,
                      lgbm_model.predict_proba(x_subset1LGBM)[:, 1]))
    print('ExtraTrees',
          roc_auc_score(y_subset1,
                        et_model.predict_proba(x_subset1ET)[:, 1]))

    #For the second subset
    predict = lightGBM_weight * lgbm_model.predict_proba(x_subset2LGBM) + (
        1 - lightGBM_weight) * et_model.predict_proba(x_subset2ET)
    auc = roc_auc_score(y_subset2, predict[:, 1])
    print('\n The performance on the second subset')
    print('The ensemble model: ', auc)
    print(
        'LightGBM',
        roc_auc_score(y_subset2,
                      lgbm_model.predict_proba(x_subset2LGBM)[:, 1]))
    print('ExtraTrees',
          roc_auc_score(y_subset2,
                        et_model.predict_proba(x_subset2ET)[:, 1]))

    return lightGBM_weight
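
#A minimal sketch (illustrative) of applying the weight returned by firstLayerWeights as
#a soft-vote ensemble, mirroring the weighted sum used inside the function above.
#x_lgbm and x_et stand for feature matrices prepared with the same column selections.
def ensembleProba(x_lgbm, x_et, lgbm_model, et_model, lightGBM_weight):
    #Weighted average of the two classifiers' class probabilities
    return (lightGBM_weight * lgbm_model.predict_proba(x_lgbm) +
            (1 - lightGBM_weight) * et_model.predict_proba(x_et))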
attNames = ['BruteForce', 'DoS', 'Web', 'Bot', 'PortScan', 'DDoS']
#The number of attack types
n_attack = len(attNames)
#clfKeys lists the 15 pairwise combinations of the 6 attack types
clfKeys = []
for i in range(n_attack - 1):
    for j in range(i + 1, n_attack):
        clfKeys += [attNames[i] + '-' + attNames[j]]

#Construct the second layer of the IDS system using a one-vs-one approach

#Load a LightGBM classifier for each of the 15 attack-type pairs
et_clfModels = {}
lgbm_clfModels = {}
for clfkey in clfKeys:
    lgbm_clfModels[clfkey] = util.pklReader(
        clfkey,
        path=util.getResourcePath() +
        '/Pickle Files/Models/Second Layer/LightGBM/')


#Use the constructed models to predict the data
def getClassification(X_test):
    initialData = np.zeros((len(X_test), n_attack))
    predictResult = pd.DataFrame(initialData, columns=attNames)

    #Sum the probability of each classifier
    for clfkey in clfKeys:
        key = clfkey.split('-')
        lgbm_clfModel = lgbm_clfModels[clfkey]
        y_predict = lgbm_clfModel.predict_proba(X_test)
        predictResult[key[0]] += y_predict[:, 0]
        predictResult[key[1]] += y_predict[:, 1]
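
#A hedged sketch (not from the truncated snippet above) of the usual final step in a
#one-vs-one scheme: for each observation, pick the attack type whose summed probability
#is largest. Mapping the attack name back to a numeric label would be an extra, assumed step.
def pickAttackType(predictResult):
    #Column label (attack name) with the highest summed probability per row
    return predictResult.idxmax(axis=1)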