Code Example #1
File: MainSVDD.py  Project: Xammed/Anomaly-Detection
# Imports assumed by these examples; DatasetHandler, Splitter, Trainer, and the
# other helpers come from elsewhere in the Xammed/Anomaly-Detection project.
import numpy as np
from sklearn.svm import OneClassSVM


def SVDDTest(dataset, labelCol, beginDataCol, endDataCol, listOfSplits,
             thresholds):
    # Placeholder codebook, only needed to create the DatasetHandler object
    # ("real" codebooks are only needed for anomaly detection testing using the ECOC method).
    cb = [0]
    SP = Splitter()
    dh = DatasetHandler(cb)
    SVDD = OneClassSVM(gamma='auto')

    holdoutIndices = getHoldoutIndices(dataset, labelCol, beginDataCol,
                                       endDataCol)
    allData, allOriginalLabels = dh.getData(dataset, labelCol, beginDataCol,
                                            endDataCol)
    savedOriginalLabels = allOriginalLabels.copy()  # All labels required for assignLabel() (not the trimmed version)
    initTrimmedAllData, initTrimmedAllOriginalLabels, initScaledData = \
        processOriginalData(dh, allData, allOriginalLabels)

    for split in listOfSplits:
        knownPredictionAccuracies = []
        unknownPredictionAccuracies = []
        for holdoutIndex in holdoutIndices:
            # Work with copies of the data so that it only needs to be imported once per test;
            # otherwise the data would change slightly on each run.
            trimmedAllOriginalLabels = initTrimmedAllOriginalLabels.copy()
            scaledData = initScaledData.copy()

            listOfUnknownClasses, listOfKnownClasses, holdoutClass = \
                SP.assignLabel(trimmedAllOriginalLabels, savedOriginalLabels, split, holdoutIndex)

            knownThresholdBuildingData, knownThresholdBuildingLabels, singleDataSamples, singleDataSamplesLabels, knownData, \
            knownLabels, unknownThresholdBuildingData, unknownThresholdBuildingLabels, holdoutData, holdoutLabels \
                = SP.splitDataAndLabels(scaledData, trimmedAllOriginalLabels, listOfUnknownClasses, holdoutClass)

            # Train SVDD model
            SVDD.fit(knownThresholdBuildingData)

            # Test on known data and unknown data
            knownPredictions = SVDD.predict(singleDataSamples).tolist()
            unknownPredictions = SVDD.predict(holdoutData).tolist()

            # Inliers are predicted as 1 by the SVDD; an inlier prediction means the
            # sample is classified as known.
            knownPredictionAccuracy = knownPredictions.count(1) / len(knownPredictions)

            # Outliers are predicted as -1 by the SVDD; an outlier prediction means the
            # sample is classified as unknown.
            unknownPredictionAccuracy = unknownPredictions.count(-1) / len(unknownPredictions)

            knownPredictionAccuracies.append(knownPredictionAccuracy)
            unknownPredictionAccuracies.append(unknownPredictionAccuracy)
        print("Split:", split)
        print("\tKnown prediction accuracy across all holdouts:",
              np.mean(knownPredictionAccuracies))
        print("\tUnknown prediction accuracy across all holdouts:",
              np.mean(unknownPredictionAccuracies))
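
For context, a minimal, self-contained sketch of the inlier/outlier convention used above: OneClassSVM is fit on "known" data only, predicts 1 for inliers and -1 for outliers, and accuracy is the fraction of each. The synthetic arrays below are hypothetical stand-ins, not part of the Xammed/Anomaly-Detection project.

import numpy as np
from sklearn.svm import OneClassSVM

rng = np.random.default_rng(0)
known = rng.normal(0.0, 1.0, size=(200, 2))    # stand-in for knownThresholdBuildingData
unknown = rng.normal(6.0, 1.0, size=(50, 2))   # stand-in for holdoutData

model = OneClassSVM(gamma='auto').fit(known)
knownPreds = model.predict(known).tolist()
unknownPreds = model.predict(unknown).tolist()

# Fraction of known samples predicted as inliers (1) and of unknown samples
# predicted as outliers (-1), matching the accuracy computation in SVDDTest.
print("Known accuracy:", knownPreds.count(1) / len(knownPreds))
print("Unknown accuracy:", unknownPreds.count(-1) / len(unknownPreds))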
Code Example #2
def getHoldoutIndices(dataset, labelsColumn, dataBeginIndex, dataEndIndex):
    dh = DatasetHandler([-1])
    data, labels = dh.getData(dataset, labelsColumn, dataBeginIndex,
                              dataEndIndex)
    indicesToRemove, dataToRemove, labelsToRemove = dh.getSmallClasses(
        data, labels)
    holdoutIndices = dh.getHoldoutIndices(labels, labelsToRemove)
    return holdoutIndices
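
The wrapper above relies on DatasetHandler to exclude "small" classes from the holdout rotation. As a rough sketch of that idea (hypothetical name and minCount; the actual criterion inside getSmallClasses is not shown on this page):

from collections import Counter

def holdoutEligibleClasses(labels, minCount=10):
    # Classes with too few samples are never used as the holdout class.
    counts = Counter(labels)
    return sorted(cls for cls, n in counts.items() if n >= minCount)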
Code Example #3
def clusteringTest(dataset, labelCol, beginDataCol, endDataCol, listOfSplits,
                   thresholds):
    # Placeholder codebook, only needed to create the DatasetHandler object
    # ("real" codebooks are only needed for anomaly detection testing using the ECOC method).
    cb = [0]
    SP = Splitter()
    dh = DatasetHandler(cb)
    CAD = ClusteringAnomalyDetection()

    holdoutIndices = getHoldoutIndices(dataset, labelCol, beginDataCol,
                                       endDataCol)
    allData, allOriginalLabels = dh.getData(dataset, labelCol, beginDataCol,
                                            endDataCol)
    savedOriginalLabels = allOriginalLabels.copy()  # All labels required for assignLabel() (not the trimmed version)
    initTrimmedAllData, initTrimmedAllOriginalLabels, initScaledData = \
        processOriginalData(dh, allData, allOriginalLabels)

    for split in listOfSplits:
        knownPredictionAccuracies = []
        unknownPredictionAccuracies = []
        for holdoutIndex in holdoutIndices:
            # Work with copies of the data so that it only needs to be imported once per test;
            # otherwise the data would change slightly on each run.
            trimmedAllOriginalLabels = initTrimmedAllOriginalLabels.copy()
            scaledData = initScaledData.copy()

            listOfUnknownClasses, listOfKnownClasses, holdoutClass = \
                SP.assignLabel(trimmedAllOriginalLabels, savedOriginalLabels, split, holdoutIndex)

            knownThresholdBuildingData, knownThresholdBuildingLabels, singleDataSamples, singleDataSamplesLabels, knownData, \
            knownLabels, unknownThresholdBuildingData, unknownThresholdBuildingLabels, holdoutData, holdoutLabels \
                = SP.splitDataAndLabels(scaledData, trimmedAllOriginalLabels, listOfUnknownClasses, holdoutClass)

            # knownThresholdBuildingData is used to fit the clustering algorithm because the model
            # itself acts as the threshold (it is what determines whether a data sample is known or
            # unknown). singleDataSamples and holdoutData are used to test the model because the same
            # splits of data are used to test the threshold in the ECOC anomaly detection approach.
            unknownPredictionAccuracy, knownPredictionAccuracy = \
                CAD.runAnomalyDetectionTests(knownThresholdBuildingData, singleDataSamples, holdoutData, thresholds)

            knownPredictionAccuracies.append(knownPredictionAccuracy)
            unknownPredictionAccuracies.append(unknownPredictionAccuracy)
        print("Split:", split)
        print("\tKnown prediction accuracy across all holdouts:",
              np.mean(knownPredictionAccuracies))
        print("\tUnknown prediction accuracy across all holdouts:",
              np.mean(unknownPredictionAccuracies))
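
ClusteringAnomalyDetection's internals are not shown on this page; as a hedged sketch of how such a test can work, the snippet below fits k-means on the known threshold-building data and sweeps a distance-to-nearest-centroid threshold. All names here (thresholdAccuracies, k) are hypothetical, not the project's API.

import numpy as np
from sklearn.cluster import KMeans

def thresholdAccuracies(trainData, knownTest, unknownTest, thresholds, k=3):
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(trainData)
    # Distance from each test sample to its nearest cluster centre.
    knownDists = km.transform(knownTest).min(axis=1)
    unknownDists = km.transform(unknownTest).min(axis=1)
    best = (0.0, 0.0, None)
    for t in thresholds:
        knownAcc = float(np.mean(knownDists <= t))     # known samples should fall inside
        unknownAcc = float(np.mean(unknownDists > t))  # unknown samples should fall outside
        if knownAcc + unknownAcc > best[0] + best[1]:
            best = (knownAcc, unknownAcc, t)
    return best  # (knownAccuracy, unknownAccuracy, bestThreshold)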
Code Example #4
def runAnomalyDetectionTests(listOfCBs, listOfThresholds, listOfNewSplits,
                             dataset, labelCol, beginDataCol, endDataCol,
                             classifier, folderPathAcc, folderPathHDs, ROCPath,
                             buildTresholdHistogramPath, confusionMatrixPath):

    # Determine which classes to cycle through (ignoring 'small' classes).
    holdoutIndices = getHoldoutIndices(dataset, labelCol, beginDataCol,
                                       endDataCol)

    iterationCount = 1
    optimalThresholds = []
    listOfDifferences = []
    unknownAccuracies = []
    knownAccuracies = []

    codebookNum = 0
    splitter = Splitter()
    trainer = Trainer()
    tm = ThresholdManager()
    IL = IncrementalLearningFunctions()
    vis = IncrementalLearningVisuals()

    for codebook in listOfCBs:
        # All the dictionaries below are used in creating the graph of all
        # the accuracies across all splits (accuraciesPlot())
        # Max
        unknownMaxAccDictionary = {}
        knownMaxAccDictionary = {}
        thresholdMaxDictionary = {}
        # Min
        unknownMinAccDictionary = {}
        knownMinAccDictionary = {}
        thresholdMinDictionary = {}
        # Var
        unknownVarDictionary = {}
        knownVarDictionary = {}
        thresholdVarDictionary = {}
        # Means
        unknownMeanDictionary = {}
        knownMeanDictionary = {}
        thresholdMeanDictionary = {}

        dh = DatasetHandler(codebook)
        allData, allOriginalLabels = dh.getData(dataset, labelCol,
                                                beginDataCol, endDataCol)
        savedOriginalLabels = allOriginalLabels.copy()  # All labels required for assignLabel() (not the trimmed version)
        initTrimmedAllData, initTrimmedAllOriginalLabels, initScaledData, codewordColumns = \
            processOriginalData(dh, allData, allOriginalLabels, savedOriginalLabels)

        codebookNum += 1
        for split in listOfNewSplits:
            # Used for ROC
            knownAccuraciesToAverage = []
            unknownAccuraciesToAverage = []
            highestKnownAccuracies = []
            highestUnknownAccuracies = []

            # Lists which will contain the data necessary to create a confusion matrix.
            predictions = []
            actuals = []
            for holdout in holdoutIndices:
                # Work with copies of the data so that it only needs to be imported once per test;
                # otherwise the data would change slightly on each run.
                trimmedAllData = initTrimmedAllData.copy()
                trimmedAllOriginalLabels = initTrimmedAllOriginalLabels.copy()
                scaledData = initScaledData.copy()

                listOfUnknownClasses, listOfKnownClasses, holdoutClass = \
                    splitter.assignLabel(trimmedAllOriginalLabels, savedOriginalLabels, split, holdout)

                knownThresholdBuildingData, knownThresholdBuildingLabels, singleDataSamples, singleDataSamplesLabels, knownData, \
                knownLabels, unknownThresholdBuildingData, unknownThresholdBuildingLabels, holdoutData, holdoutLabels \
                    = splitter.splitDataAndLabels(scaledData, trimmedAllOriginalLabels, listOfUnknownClasses, holdoutClass)

                # Ensure the number of unknown threshold-building samples never exceeds
                # the number of known samples.
                if len(unknownThresholdBuildingData) > len(knownThresholdBuildingData):
                    unknownThresholdBuildingData, unknownThresholdBuildingLabels = \
                        splitter.reduceThresholdBuildingSamples_FewestClasses(knownThresholdBuildingData,
                                                                              unknownThresholdBuildingData,
                                                                              unknownThresholdBuildingLabels)

                knownCWLabels = trainer.convertLabelToCodeword(
                    codewordColumns, knownLabels)
                listOfClassifiers = trainer.trainClassifiers(
                    knownData, knownCWLabels, classifier, knownLabels)

                # Getting predictions on all relevant data:
                unknownThresholdBuildingPreds, singleDataSamplesPreds, knownThresholdBuildingPreds = \
                    getPredictions(unknownThresholdBuildingData, singleDataSamples,
                                   knownThresholdBuildingData, listOfClassifiers, trainer)

                # Getting the shortest Hamming distance that each prediction corresponds to:
                unknownThresholdBuildingHDs, singleDataSamplesHDs, knownThresholdBuildingHDs = \
                    getMinimumHammingDistanceLists(trainer, codebook, unknownThresholdBuildingPreds,
                                                   singleDataSamplesPreds, knownThresholdBuildingPreds)

                optimalThreshold, lowestDifference, highestKnownAcc, highestUnknownAcc = \
                    tm.findOptimalThreshold(listOfThresholds, knownThresholdBuildingHDs, unknownThresholdBuildingHDs)

                # Updating the predicted codewords; used for creating the confusion matrix (not needed otherwise, yet).
                unknownECOCPreds, singleDataSamplesECOCPreds, knownThresholdBuildingECOCPreds = \
                    updatePredictions(trainer, codebook, unknownThresholdBuildingPreds,
                                      singleDataSamplesPreds, knownThresholdBuildingPreds, optimalThreshold)

                predictions.append(singleDataSamplesECOCPreds)

                # The labels aren't converted to codewords yet.
                codewordSDSLabels = \
                    trainer.toCodeword(trainer.convertLabelToCodeword(codewordColumns, singleDataSamplesLabels))
                actuals.append(codewordSDSLabels)

                # Graphs a histogram showing the process of building the threshold (a different
                # "view" of what the similar call further below shows).
                # The final argument, True, determines where this function saves to file (see the
                # function's comment in the DataManagement class for details).
                # vis.graphThresholdTestHistogram(knownThresholdBuildingHDs, unknownThresholdBuildingHDs, optimalThreshold,
                #                                 codebookNum, split, highestKnownAcc,
                #                                 highestUnknownAcc, 12, holdout, allData,
                #                                 unknownThresholdBuildingData, knownData, codebook,
                #                                 singleDataSamples, buildTresholdHistogramPath, classifier, True)

                # Data for generating ROC
                # knownAccuraciesAll, unknownAccuraciesAll = tm.testAllThresholds(listOfThresholds,
                #                                                knownThresholdBuildingHDs, unknownThresholdBuildingHDs)
                # knownAccuraciesToAverage.append(knownAccuraciesAll)
                # unknownAccuraciesToAverage.append(unknownAccuraciesAll)

                # Getting accuracies of predictions (whether known or unknown):
                # knownHoldoutDataThresholdAcc = tm.knownThresholdTest(singleDataSamplesHDs, optimalThreshold)
                # unknownHoldoutDataThresholdAcc = tm.unknownThresholdTest(holdoutClassHDs, optimalThreshold)

                testData, testLabels, holdoutData, holdoutLabels = \
                    splitter.holdoutClassSplit(holdoutData, holdoutLabels)

                listOfPotentialCodewords = trainer.getPredictions(
                    holdoutData, listOfClassifiers)
                vis.graphCodewordFrequency(listOfPotentialCodewords)
                vis.graphBitFrequency(listOfPotentialCodewords)

                finalCodeword = IL.generateCodeword_Averaged(
                    listOfPotentialCodewords)
                print(
                    "Accuracy:",
                    IL.testGeneratedCodeword(codebook, listOfClassifiers,
                                             finalCodeword, testData, 1000))

                iterationCount += 1

                optimalThresholds.append(optimalThreshold)
                highestKnownAccuracies.append(highestKnownAcc)
                highestUnknownAccuracies.append(highestUnknownAcc)
                listOfDifferences.append(lowestDifference)
                # unknownAccuracies.append(unknownHoldoutDataThresholdAcc)
                # knownAccuracies.append(knownHoldoutDataThresholdAcc)

                # Graphing to see how the threshold is performing (threshold-test visualization).
                # The final argument, False, determines where this function saves to file (see the
                # function's comment in the DataManagement class for details).
                # vis.graphThresholdTestHistogram(singleDataSamplesHDs, holdoutClassHDs, optimalThreshold, codebookNum,
                #                                split, knownHoldoutDataThresholdAcc, unknownHoldoutDataThresholdAcc,
                #                                12, holdoutClass, trimmedAllData, unknownThresholdBuildingData, knownData,
                #                                codebook, singleDataSamples, folderPathHDs, classifier, False)

            # ROC
            # averagedKnownAccuracies = tm.averageThresholdAccuracies(knownAccuraciesToAverage)
            # averagedUnknownAccuracies = tm.averageThresholdAccuracies(unknownAccuraciesToAverage)
            # averagedBestKnownAcc = np.mean(highestKnownAccuracies)
            # averagedBestUnknownAcc = np.mean(highestUnknownAccuracies)
            # averagedBestThreshold = np.mean(optimalThresholds)
            # vis.graphROC(averagedUnknownAccuracies, averagedKnownAccuracies, split, codebook, ROCPath,
            #              classifier, averagedBestKnownAcc, averagedBestUnknownAcc, averagedBestThreshold, codebookNum)

            # Confusion matrix
            # vis.generateConfusionMatrix(predictions, actuals, codebook, confusionMatrixPath, classifier, codebookNum,
            #                             split)

            # printResults(unknownAccuracies, knownAccuracies, optimalThresholds, codebookNum, split)

            # thresholdMaxDictionary[split] = max(optimalThresholds)
            # thresholdMinDictionary[split] = min(optimalThresholds)
            # thresholdVarDictionary[split] = np.var((optimalThresholds))
            # thresholdMeanDictionary[split] = np.mean(optimalThresholds)
            #
            # # Used for creating accuracies graph at the end ('accuraciesPlot()')
            # knownMaxAccDictionary[split] = max(knownAccuracies)
            # knownMinAccDictionary[split] = min(knownAccuracies)
            # knownVarDictionary[split] = np.var(knownAccuracies)
            # knownMeanDictionary[split] = np.mean(knownAccuracies)
            #
            # unknownMaxAccDictionary[split] = max(unknownAccuracies)
            # unknownMinAccDictionary[split] = min(unknownAccuracies)
            # unknownVarDictionary[split] = np.var(unknownAccuracies)
            # unknownMeanDictionary[split] = np.mean((unknownAccuracies))

            optimalThresholds = []
            unknownAccuracies = []
            knownAccuracies = []
            iterationCount = 1
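
The core of the ECOC approach above is thresholding the minimum Hamming distance between a predicted codeword and the codebook. An illustrative helper (hypothetical names; the project's Trainer/ThresholdManager methods are not reproduced here):

def minHammingDistance(predictedCodeword, codebook):
    # Smallest number of differing bits between the prediction and any codeword.
    return min(sum(p != c for p, c in zip(predictedCodeword, row))
               for row in codebook)

def isUnknown(predictedCodeword, codebook, threshold):
    # Distances above the optimal threshold are flagged as unknown/anomalous,
    # mirroring how findOptimalThreshold separates known from unknown HDs.
    return minHammingDistance(predictedCodeword, codebook) > threshold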