def SVDDTest(dataset, labelCol, beginDataCol, endDataCol, listOfSplits, thresholds):
    """Run SVDD (one-class SVM) anomaly-detection tests over all splits/holdouts.

    For each split in ``listOfSplits`` and each holdout class, fits a one-class
    SVM on the known threshold-building data, then measures:

    * known accuracy   - fraction of single data samples predicted as inliers (+1)
    * unknown accuracy - fraction of holdout-class samples predicted as outliers (-1)

    The mean of each accuracy across all holdouts is printed per split.

    NOTE(review): ``thresholds`` is unused here (the SVM decides known/unknown
    itself); the parameter is kept for signature parity with clusteringTest().
    """
    # Placeholder codebook: only needed to construct DatasetHandler. "Real"
    # codebooks only matter for anomaly detection via the ECOC method.
    cb = [0]
    SP = Splitter()
    dh = DatasetHandler(cb)
    SVDD = OneClassSVM(gamma='auto')
    holdoutIndices = getHoldoutIndices(dataset, labelCol, beginDataCol, endDataCol)
    allData, allOriginalLabels = dh.getData(dataset, labelCol, beginDataCol, endDataCol)
    # All labels are required for assignLabel() (not the trimmed version).
    savedOriginalLabels = allOriginalLabels.copy()
    initTrimmedAllData, initTrimmedAllOriginalLabels, initScaledData = \
        processOriginalData(dh, allData, allOriginalLabels)
    for split in listOfSplits:
        knownPredictionAccuracies = []
        unknownPredictionAccuracies = []
        for holdoutIndex in holdoutIndices:
            # Work on copies so the data only needs importing once per run;
            # otherwise the data gets changed slightly per run.
            trimmedAllOriginalLabels = initTrimmedAllOriginalLabels.copy()
            scaledData = initScaledData.copy()
            listOfUnknownClasses, listOfKnownClasses, holdoutClass = \
                SP.assignLabel(trimmedAllOriginalLabels, savedOriginalLabels, split, holdoutIndex)
            knownThresholdBuildingData, knownThresholdBuildingLabels, singleDataSamples, \
                singleDataSamplesLabels, knownData, knownLabels, unknownThresholdBuildingData, \
                unknownThresholdBuildingLabels, holdoutData, holdoutLabels \
                = SP.splitDataAndLabels(scaledData, trimmedAllOriginalLabels,
                                        listOfUnknownClasses, holdoutClass)
            # Train the SVDD model.
            SVDD.fit(knownThresholdBuildingData)
            # Test on known data and unknown (holdout) data.
            knownPredictions = SVDD.predict(singleDataSamples).tolist()
            unknownPredictions = SVDD.predict(holdoutData).tolist()
            # Inliers are predicted as +1 by SVDD; inliers are equivalent to
            # predictions classified as known. (Python 3 `/` is already true
            # division, so the historical `* 1.0` widening was dropped.)
            knownPredictionAccuracy = knownPredictions.count(1) / len(knownPredictions)
            # Outliers are predicted as -1 by SVDD; outliers are equivalent to
            # predictions classified as unknown.
            unknownPredictionAccuracy = unknownPredictions.count(-1) / len(unknownPredictions)
            knownPredictionAccuracies.append(knownPredictionAccuracy)
            unknownPredictionAccuracies.append(unknownPredictionAccuracy)
        print("Split:", split)
        print("\tKnown prediction accuracy across all holdouts:",
              np.mean(knownPredictionAccuracies))
        print("\tUnknown prediction accuracy across all holdouts:",
              np.mean(unknownPredictionAccuracies))
def getHoldoutIndices(dataset, labelsColumn, dataBeginIndex, dataEndIndex):
    """Return the indices of classes eligible to serve as holdout classes.

    Loads the dataset, identifies classes too small to use, and asks the
    DatasetHandler for the holdout indices of everything that remains.
    """
    handler = DatasetHandler([-1])  # placeholder codebook; only data loading is needed here
    data, labels = handler.getData(dataset, labelsColumn, dataBeginIndex, dataEndIndex)
    _, _, smallClassLabels = handler.getSmallClasses(data, labels)
    return handler.getHoldoutIndices(labels, smallClassLabels)
def clusteringTest(dataset, labelCol, beginDataCol, endDataCol, listOfSplits, thresholds):
    """Run clustering-based anomaly-detection tests over all splits/holdouts.

    For each split in ``listOfSplits`` and each holdout class, fits the
    clustering detector on known threshold-building data and evaluates how
    often known samples and holdout (unknown) samples are classified
    correctly. Mean accuracies across all holdouts are printed per split.
    """
    # Placeholder codebook: only needed to construct DatasetHandler. "Real"
    # codebooks only matter for anomaly detection via the ECOC method.
    splitter = Splitter()
    handler = DatasetHandler([0])
    detector = ClusteringAnomalyDetection()
    holdoutIndices = getHoldoutIndices(dataset, labelCol, beginDataCol, endDataCol)
    allData, allOriginalLabels = handler.getData(dataset, labelCol, beginDataCol, endDataCol)
    # Untrimmed labels are required by assignLabel().
    savedOriginalLabels = allOriginalLabels.copy()
    initTrimmedAllData, initTrimmedAllOriginalLabels, initScaledData = \
        processOriginalData(handler, allData, allOriginalLabels)
    for split in listOfSplits:
        knownAccs = []
        unknownAccs = []
        for holdoutIndex in holdoutIndices:
            # Copies keep the freshly-imported data pristine between runs;
            # otherwise it drifts slightly from run to run.
            labelsCopy = initTrimmedAllOriginalLabels.copy()
            dataCopy = initScaledData.copy()
            listOfUnknownClasses, listOfKnownClasses, holdoutClass = \
                splitter.assignLabel(labelsCopy, savedOriginalLabels, split, holdoutIndex)
            (knownThresholdBuildingData, knownThresholdBuildingLabels, singleDataSamples,
             singleDataSamplesLabels, knownData, knownLabels, unknownThresholdBuildingData,
             unknownThresholdBuildingLabels, holdoutData, holdoutLabels) = \
                splitter.splitDataAndLabels(dataCopy, labelsCopy,
                                            listOfUnknownClasses, holdoutClass)
            # knownThresholdBuildingData fits the clustering algorithm because the
            # model itself acts as the threshold (it decides known vs. unknown).
            # singleDataSamples and holdoutData test the model so that the same
            # splits are used as in the ECOC anomaly-detection approach.
            unknownAcc, knownAcc = detector.runAnomalyDetectionTests(
                knownThresholdBuildingData, singleDataSamples, holdoutData, thresholds)
            knownAccs.append(knownAcc)
            unknownAccs.append(unknownAcc)
        print("Split:", split)
        print("\tKnown prediction accuracy across all holdouts:", np.mean(knownAccs))
        print("\tUnknown prediction accuracy across all holdouts:", np.mean(unknownAccs))
def runAnomalyDetectionTests(listOfCBs, listOfThresholds, listOfNewSplits, dataset, labelCol,
                             beginDataCol, endDataCol, classifier, folderPathAcc, folderPathHDs,
                             ROCPath, buildTresholdHistogramPath, confusionMatrixPath):
    """Run ECOC-based anomaly-detection tests across codebooks, splits and holdouts.

    For each codebook in ``listOfCBs``, each split in ``listOfNewSplits``, and each
    eligible holdout class: trains ECOC classifiers on known data, computes minimum
    Hamming distances for known/unknown threshold-building predictions, searches
    ``listOfThresholds`` for the threshold best separating known from unknown, then
    generates an averaged codeword for the holdout class and prints its accuracy
    (incremental-learning step).

    The path arguments (``folderPathAcc``, ``folderPathHDs``, ``ROCPath``,
    ``buildTresholdHistogramPath``, ``confusionMatrixPath``) are only consumed by
    the currently commented-out graphing/ROC/confusion-matrix code below.
    """
    # Determine which classes to cycle through (ignoring 'small' classes).
    holdoutIndices = getHoldoutIndices(dataset, labelCol, beginDataCol, endDataCol)
    iterationCount = 1
    optimalThresholds = []
    listOfDifferences = []
    unknownAccuracies = []
    knownAccuracies = []
    codebookNum = 0
    splitter = Splitter()
    trainer = Trainer()
    tm = ThresholdManager()
    IL = IncrementalLearningFunctions()
    vis = IncrementalLearningVisuals()
    for codebook in listOfCBs:
        # All the dictionaries below are used in creating the graph of all
        # the accuracies across all splits (accuraciesPlot()). They are only
        # populated by the commented-out code near the bottom of this function.
        # Max
        unknownMaxAccDictionary = {}
        knownMaxAccDictionary = {}
        thresholdMaxDictionary = {}
        # Min
        unknownMinAccDictionary = {}
        knownMinAccDictionay = {}
        thresholdMinDictionary = {}
        # Var
        unknownVarDictionary = {}
        knownVarDictionary = {}
        thresholdVarDictionary = {}
        # Means
        unknownMeanDictionary = {}
        knownMeanDictionary = {}
        thresholdMeanDictionary = {}
        dh = DatasetHandler(codebook)
        allData, allOriginalLabels = dh.getData(dataset, labelCol, beginDataCol, endDataCol)
        # All labels are required for assignLabel() (not the trimmed version).
        savedOriginalLabels = allOriginalLabels.copy()
        initTrimmedAllData, initTrimmedAllOriginalLabels, initScaledData, codewordColumns = \
            processOriginalData(dh, allData, allOriginalLabels, savedOriginalLabels)
        codebookNum += 1
        for split in listOfNewSplits:
            # Used for ROC.
            knownAccuraciesToAverage = []
            unknownAccuraciesToAverage = []
            highestKnownAccuracies = []
            highestUnknownAccuracies = []
            # Lists which will contain the data necessary to create a confusion matrix.
            predictions = []
            actuals = []
            for holdout in holdoutIndices:
                # Working with copies of the data so that we only need to import
                # the data once per run; otherwise it gets changed slightly per run.
                trimmedAllData = initTrimmedAllData.copy()
                trimmedAllOriginalLabels = initTrimmedAllOriginalLabels.copy()
                scaledData = initScaledData.copy()
                listOfUnknownClasses, listOfKnownClasses, holdoutClass = \
                    splitter.assignLabel(trimmedAllOriginalLabels, savedOriginalLabels, split, holdout)
                knownThresholdBuildingData, knownThresholdBuildingLabels, singleDataSamples, singleDataSamplesLabels, knownData, \
                    knownLabels, unknownThresholdBuildingData, unknownThresholdBuildingLabels, holdoutData, holdoutLabels \
                    = splitter.splitDataAndLabels(scaledData, trimmedAllOriginalLabels, listOfUnknownClasses, holdoutClass)
                # Ensuring number of unknown threshold-building data samples never
                # exceeds known data samples.
                if len(unknownThresholdBuildingData) > len(knownThresholdBuildingData):
                    unknownThresholdBuildingData, unknownThresholdBuildingLabels, = \
                        splitter.reduceThresholdBuildingSamples_FewestClasses(knownThresholdBuildingData,
                                                                              unknownThresholdBuildingData,
                                                                              unknownThresholdBuildingLabels)
                knownCWLabels = trainer.convertLabelToCodeword(codewordColumns, knownLabels)
                listOfClassifiers = trainer.trainClassifiers(knownData, knownCWLabels, classifier, knownLabels)
                # Getting predictions on all relevant data:
                unknownThresholdBuildingPreds, singleDataSamplesPreds, knownThresholdBuildingPreds = \
                    getPredictions(unknownThresholdBuildingData, singleDataSamples,
                                   knownThresholdBuildingData, listOfClassifiers, trainer)
                # Getting the shortest hamming distance that each prediction corresponds to:
                unknownThresholdBuildingHDs, singleDataSamplesHDs, knownThresholdBuildingHDs = \
                    getMinimumHammingDistanceLists(trainer, codebook, unknownThresholdBuildingPreds,
                                                   singleDataSamplesPreds, knownThresholdBuildingPreds)
                optimalThreshold, lowestDifference, highestKnownAcc, highestUnknownAcc = \
                    tm.findOptimalThreshold(listOfThresholds, knownThresholdBuildingHDs, unknownThresholdBuildingHDs)
                # Updating the predicted codewords. Used for creating the confusion
                # matrix (not needed otherwise, yet).
                unknownECOCPreds, singleDataSamplesECOCPreds, knownThresholdBuildingECOCPreds = \
                    updatePredictions(trainer, codebook, unknownThresholdBuildingPreds, singleDataSamplesPreds,
                                      knownThresholdBuildingPreds, optimalThreshold)
                predictions.append(singleDataSamplesECOCPreds)
                # Labels aren't converted to codewords yet.
                codewordSDSLabels = \
                    trainer.toCodeword(trainer.convertLabelToCodeword(codewordColumns, singleDataSamplesLabels))
                actuals.append(codewordSDSLabels)
                # Graphs histogram showing the process of building the threshold (different "view" of what this method
                # is showing slightly below).
                # The final argument "True" is used to determine where this function should save to file (read
                # function's comment in the DataManagement class to read more).
                # vis.graphThresholdTestHistogram(knownThresholdBuildingHDs, unknownThresholdBuildingHDs, optimalThreshold,
                #                                 codebookNum, split, highestKnownAcc,
                #                                 highestUnknownAcc, 12, holdout, allData,
                #                                 unknownThresholdBuildingData, knownData, codebook,
                #                                 singleDataSamples, buildTresholdHistogramPath, classifier, True)
                # Data for generating ROC
                # knownAccuraciesAll, unknownAccuraciesAll = tm.testAllThresholds(listOfThresholds,
                #                                               knownThresholdBuildingHDs, unknownThresholdBuildingHDs)
                # knownAccuraciesToAverage.append(knownAccuraciesAll)
                # unknownAccuraciesToAverage.append(unknownAccuraciesAll)
                # Getting accuracies of predictions (whether known or unknown):
                # knownHoldoutDataThresholdAcc = tm.knownThresholdTest(singleDataSamplesHDs, optimalThreshold)
                # unknownHoldoutDataThresholdAcc = tm.unknownThresholdTest(holdoutClassHDs, optimalThreshold)
                testData, testLabels, holdoutData, holdoutLabels = \
                    splitter.holdoutClassSplit(holdoutData, holdoutLabels)
                listOfPotentialCodewords = trainer.getPredictions(holdoutData, listOfClassifiers)
                vis.graphCodewordFrequency(listOfPotentialCodewords)
                vis.graphBitFrequency(listOfPotentialCodewords)
                finalCodeword = IL.generateCodeword_Averaged(listOfPotentialCodewords)
                print(
                    "Accuracy:",
                    IL.testGeneratedCodeword(codebook, listOfClassifiers, finalCodeword, testData, 1000))
                iterationCount += 1
                optimalThresholds.append(optimalThreshold)
                highestKnownAccuracies.append(highestKnownAcc)
                highestUnknownAccuracies.append(highestUnknownAcc)
                listOfDifferences.append(lowestDifference)
                # unknownAccuracies.append(unknownHoldoutDataThresholdAcc)
                # knownAccuracies.append(knownHoldoutDataThresholdAcc)
                # Graphing to see how threshold is performing/testing threshold visualization:
                # The final argument "False" is used to determine where this function should save to file (read
                # function's comment in the DataManagement class to read more).
                # vis.graphThresholdTestHistogram(singleDataSamplesHDs, holdoutClassHDs, optimalThreshold, codebookNum,
                #                                 split, knownHoldoutDataThresholdAcc, unknownHoldoutDataThresholdAcc,
                #                                 12, holdoutClass, trimmedAllData, unknownThresholdBuildingData, knownData,
                #                                 codebook, singleDataSamples, folderPathHDs, classifier, False)
            # ROC
            # averagedKnownAccuracies = tm.averageThresholdAccuracies(knownAccuraciesToAverage)
            # averagedUnknownAccuracies = tm.averageThresholdAccuracies(unknownAccuraciesToAverage)
            # averagedBestKnownAcc = np.mean(highestKnownAccuracies)
            # averagedBestUnknownAcc = np.mean(highestUnknownAccuracies)
            # averagedBestThreshold = np.mean(optimalThresholds)
            # vis.graphROC(averagedUnknownAccuracies, averagedKnownAccuracies, split, codebook, ROCPath,
            #              classifier, averagedBestKnownAcc, averagedBestUnknownAcc, averagedBestThreshold, codebookNum)
            # Confusion matrix
            # vis.generateConfusionMatrix(predictions, actuals, codebook, confusionMatrixPath, classifier, codebookNum,
            #                             split)
            # printResults(unknownAccuracies, knownAccuracies, optimalThresholds, codebookNum, split)
            # thresholdMaxDictionary[split] = max(optimalThresholds)
            # thresholdMinDictionary[split] = min(optimalThresholds)
            # thresholdVarDictionary[split] = np.var((optimalThresholds))
            # thresholdMeanDictionary[split] = np.mean(optimalThresholds)
            #
            # # Used for creating accuracies graph at the end ('accuraciesPlot()')
            # knownMaxAccDictionary[split] = max(knownAccuracies)
            # knownMinAccDictionay[split] = min(knownAccuracies)
            # knownVarDictionary[split] = np.var(knownAccuracies)
            # knownMeanDictionary[split] = np.mean(knownAccuracies)
            #
            # unknownMaxAccDictionary[split] = max(unknownAccuracies)
            # unknownMinAccDictionary[split] = min(unknownAccuracies)
            # unknownVarDictionary[split] = np.var(unknownAccuracies)
            # unknownMeanDictionary[split] = np.mean((unknownAccuracies))
            # Reset the per-split accumulators before the next split.
            # NOTE(review): the original file's line structure was collapsed, so the
            # exact nesting of these resets (per split vs. per codebook) is inferred
            # from the accumulation pattern above — confirm against version control.
            optimalThresholds = []
            unknownAccuracies = []
            knownAccuracies = []
            iterationCount = 1