def generate_feature_mapping():
    """Fit the TF-IDF vectorizer on the training corpus and export its
    feature mapping to ``feature_mapping`` (module-level path)."""
    extractTFIDFMemoryFriendly(source_code_dir, 'txt', maxfeatures=500,
                               export=True, mapping_path=feature_mapping)
def main():
    """Command-line driver for the Oedipus toolchain.

    Parses the CLI arguments and dispatches on ``arguments.mode``:

    * ``generate``            -- obfuscate the ``.c`` programs under
                                 ``--sourcedir`` with Tigress (also emits a
                                 ``.label`` file recording the transform used).
    * ``extract``             -- extract TF-IDF features from labelled
                                 obfuscated source files.
    * ``visualize*``          -- project data samples into an
                                 <x>-dimensional space.
    * ``classify-exp1``       -- K-fold classification with naive Bayes or
                                 CART trees.
    * ``classify-exp2``       -- the 36-4 (leave-programs-out) experiment.
    * ``filter-traces``       -- filter generated traces.
    * ``extract-from-traces`` -- generate TF-IDF feature vectors from traces.

    Any exception is caught at this boundary, reported together with the
    offending line number, and followed by a best-effort ``cleanUp()``.
    """
    try:
        argumentParser = defineArguments()
        arguments = argumentParser.parse_args()
        prettyPrint("Welcome to \"Oedipus\". Riddle me this!")
        #################################################
        # MODE 1: Generate obfuscated source code files #
        #################################################
        if arguments.mode == "generate":
            if arguments.verbose == "yes":
                prettyPrint("Generating obfusted programs for programs under \"%s\"" % arguments.sourcedir, "debug")
            # Get programs from source directory [random/pre-existent]
            sourceFiles = sorted(glob.glob("%s/*.c" % arguments.sourcedir))
            if len(sourceFiles) < 1:
                prettyPrint("No files were found in \"%s\". Exiting" % arguments.sourcedir, "error")
                return
            # Generate obfuscated programs
            program_generation.generateObfuscatedPrograms(sourceFiles, arguments.tigressdir, int(arguments.obfuscationlevel), arguments.obfuscationfunction)
            prettyPrint("Successfully generated obfuscated programs")
        #########################################################
        # MODE 2: Extract features from obfuscated source files #
        #########################################################
        elif arguments.mode == "extract":
            # Load obfuscated files
            if not os.path.exists(arguments.sourcedir):
                prettyPrint("Unable to locate \"%s\". Exiting" % arguments.sourcedir, "error")
                return
            sourceFiles = sorted(glob.glob("%s/*.c" % arguments.sourcedir))
            if len(sourceFiles) < 1:
                prettyPrint("No files were found in \"%s\". Exiting" % arguments.sourcedir)
                # FIX: previously fell through and kept processing an empty list.
                return
            # Keep only source files that have a ".label" ground-truth file;
            # the rest cannot be used for feature extraction.
            # FIX: the original popped from the list while iterating it, which
            # silently skips the element right after every removal.
            labelledFiles = []
            for targetFile in sourceFiles:
                if os.path.exists(targetFile.replace(".c", ".label")):
                    labelledFiles.append(targetFile)
                else:
                    prettyPrint("File \"%s\" does not have a label/metadata file. Removing" % targetFile, "warning")
            sourceFiles = labelledFiles
            ########################################################################
            # (2.0) Extract TF-IDF features from GDB generated traces of KLEE inputs
            prettyPrint("Extracting TF-IDF from GDB traces")
            if not feature_extraction.extractTFIDF(arguments.sourcedir, sourceFiles):
                prettyPrint("Could not extract features from source files. Exiting", "error")
                return
            prettyPrint("Alright!! Alles in Ordnung.", "info2")
            cleanUp()
            return
        ###########################################################
        # MODE 3: Project data samples into <x>-dimensional space #
        ###########################################################
        elif arguments.mode.find("visualize") != -1:
            if arguments.mode == "visualize":
                prettyPrint("Plotting data into %s-dimensional space with \"%s\" features." % (arguments.dimension, arguments.datatype))
                data_visualization.visualizeData(arguments.sourcedir, arguments.datatype, arguments.dimension, algorithm=arguments.visualalgorithm)
            else:
                data_visualization.visualizeOriginal(arguments.sourcedir, arguments.datatype, arguments.dimension, algorithm=arguments.visualalgorithm)
        ##############################################################################
        # MODE 4: Classify obfuscated programs using knowledge-based classification  #
        ##############################################################################
        elif arguments.mode == "classify-exp1":
            # Check the requested algorithm
            if arguments.algorithm == "bayes":  # Classify using Naive Bayes
                if arguments.datatype.find("idf") == -1:
                    # NOTE: deliberately only a warning -- the run proceeds anyway
                    # (the original `return` here was commented out).
                    prettyPrint("Naive Bayes does not support the data type \"%s\". Exiting" % arguments.datatype, "warning")
                # Load data from source directory
                X, y, allClasses = loadFeaturesFromDir(arguments.sourcedir, arguments.datatype, arguments.datalabel)
                reductionMethod = raw_input("Please choose a dimensionality reduction method (selectkbest/pca): ").lower()
                # A file to log all classification labels
                classificationLog = open("classificationlog_%s_exp1_%s_%s.txt" % (arguments.datatype, reductionMethod, arguments.algorithm), "a")
                classificationLog.write("Experiment 1 - Algorithm: %s, Datatype: %s\n" % (arguments.algorithm, arguments.datatype))
                # Choose how to reduce the dimensionality of the feature vectors
                if reductionMethod == "selectkbest":
                    accuracies, timings = [], []
                    targetDimensions = [8, 16, 32, 64, 128]#[64, 128, 256, 512, 1000]
                    for dimension in targetDimensions:
                        if arguments.verbose == "yes":
                            prettyPrint("Training a naive Bayes classifier with %s selected \"%s\" features" % (dimension, arguments.datatype), "debug")
                        accuracyRates, allProbabilities, allTimings, groundTruthLabels, predictedLabels = classification.classifyNaiveBayesKFold(X, y, kFold=int(arguments.kfold), reduceDim=reductionMethod, targetDim=dimension)
                        prettyPrint("Average classification accuracy: %s%%" % (averageList(accuracyRates)*100.0), "output")
                        accuracies.append(averageList(accuracyRates))
                        timings.append(averageList(allTimings))
                        # Log classifications
                        for foldIndex in range(len(predictedLabels)):
                            classificationLog.write("Target Dimensionality: %s\n" % dimension)
                            for labelIndex in range(len(predictedLabels[foldIndex])):
                                classificationLog.write("Class:%s,Predicted:%s\n" % (allClasses[groundTruthLabels[foldIndex][labelIndex]], allClasses[predictedLabels[foldIndex][labelIndex]]))
                    classificationLog.close()
                    # Plot accuracies graph
                    prettyPrint("Plotting accuracies")
                    data_visualization.plotAccuracyGraph(targetDimensions, accuracies, "Number of Selected Features", "Classification Accuracy", "Classification Accuracy: Selected Features (%s)" % arguments.datatype, "accuracy_%s_exp1_%s_selectkbest.pdf" % (arguments.datatype, arguments.algorithm))
                    print (timings)
                elif reductionMethod == "pca":
                    accuracies, timings = [], []
                    targetDimensions = [8, 16, 32, 64, 128]#[2, 4, 8, 16, 32, 64, 128, 256, 512, 1000]
                    for dimension in targetDimensions:
                        if arguments.verbose == "yes":
                            prettyPrint("Training a naive Bayes classifier with %s extracted \"%s\" features" % (dimension, arguments.datatype), "debug")
                        accuracyRates, allProbabilities, allTimings, groundTruthLabels, predictedLabels = classification.classifyNaiveBayesKFold(X, y, kFold=int(arguments.kfold), reduceDim=reductionMethod, targetDim=dimension)
                        prettyPrint("Average classification accuracy: %s%%" % (averageList(accuracyRates)*100.0), "output")
                        accuracies.append(averageList(accuracyRates))
                        timings.append(averageList(allTimings))
                        # Log classifications
                        for foldIndex in range(len(predictedLabels)):
                            classificationLog.write("Target Dimensionality: %s\n" % dimension)
                            for labelIndex in range(len(predictedLabels[foldIndex])):
                                classificationLog.write("Class:%s,Predicted:%s\n" % (allClasses[groundTruthLabels[foldIndex][labelIndex]], allClasses[predictedLabels[foldIndex][labelIndex]]))
                    classificationLog.close()
                    # Plot accuracies graph
                    prettyPrint("Plotting accuracies")
                    data_visualization.plotAccuracyGraph(targetDimensions, accuracies, "Number of Extracted Features", "Classification Accuracy", "Classification Accuracy: PCA (%s)" % arguments.datatype, "accuracy_%s_exp1_%s_pca.pdf" % (arguments.datatype, arguments.algorithm))
                    print (timings)
                else:
                    # No dimensionality reduction
                    accuracyRates, allProbabilities, allTimings, predictedLabels = classification.classifyNaiveBayes(X, y, kFold=int(arguments.kfold))
                    prettyPrint("Average classification accuracy: %s%%, achieved in an average of %s seconds" % (averageList(accuracyRates)*100.0, averageList(allTimings)), "output")
            ####################
            # Using CART trees #
            ####################
            elif arguments.algorithm == "tree":  # Classify using CART trees
                if arguments.datatype != "triton":
                    prettyPrint("It is recommended to use \".triton\" features", "warning")
                # Load data from source directory.
                # X: feature vectors, y: per-file obfuscation index into allClasses,
                # allClasses: every obfuscation method seen.
                X, y, allClasses = loadFeaturesFromDir(arguments.sourcedir, arguments.datatype, arguments.datalabel)
                splittingCriterion = raw_input("Please choose a splitting criterion (gini/entropy): ")
                # A file to log all classification labels
                classificationLog = open("classificationlog_%s_exp1_%s_%s.txt" % (arguments.datatype, splittingCriterion, arguments.algorithm), "a")
                classificationLog.write("Experiment 1 - Algorithm: %s, Datatype: %s\n" % (arguments.algorithm, arguments.datatype))
                accuracies, timings, allDepths = [], [], [2,3,4,5,6,7,8,10,12,14,16]#,32,64]
                for maxDepth in allDepths:
                    if arguments.verbose == "yes":
                        prettyPrint("Training a \"CART\" with \"%s\" criterion and maximum depth of %s" % (splittingCriterion, maxDepth), "debug")
                    accuracyRates, allProbabilities, allTimings, groundTruthLabels, predictedLabels = classification.classifyTreeKFold(X, y, int(arguments.kfold), splittingCriterion, int(maxDepth), visualizeTree=False)
                    # Average accuracy over the K folds
                    prettyPrint("Average classification accuracy: %s%%" % (averageList(accuracyRates)*100.0), "output")
                    accuracies.append(averageList(accuracyRates))
                    timings.append(averageList(allTimings))
                    # Log classifications
                    for foldIndex in range(len(predictedLabels)):
                        classificationLog.write("Tree Depth: %s\n" % maxDepth)
                        for labelIndex in range(len(predictedLabels[foldIndex])):
                            classificationLog.write("Class:%s,Predicted:%s\n" % (allClasses[groundTruthLabels[foldIndex][labelIndex]], allClasses[predictedLabels[foldIndex][labelIndex]]))
                classificationLog.close()
                # Plot accuracies graph
                prettyPrint("Plotting accuracies for \"%s\" criterion" % splittingCriterion)
                data_visualization.plotAccuracyGraph(allDepths, accuracies, "Maximum Tree Depth", "Classification Accuracy", "Classification Accuracy: %s (%s)" % (splittingCriterion, arguments.datatype), "accuracy_%s_exp1_%s_%s.pdf" % (arguments.datatype, splittingCriterion, arguments.algorithm))
                print (timings)
                return
        ##################################################################
        # MODE 6: Classify obfuscated programs using the 36-4 experiment #
        ##################################################################
        elif arguments.mode == "classify-exp2":
            # Retrieve the list of all original (pre-obfuscation) programs
            allPrograms = glob.glob("%s/*.c" % arguments.originalprograms)
            allPrograms.sort()  # Makes it easier to keep track of current programs in batch
            totalPrograms = len(allPrograms)
            prettyPrint("Successfully retrieved %s original programs" % totalPrograms)
            # Integer division (Python 2 "/" on ints behaves the same): e.g. 4 = 40 / 10
            chunkSize = totalPrograms // int(arguments.kfold)
            if arguments.algorithm == "tree":
                criterion = raw_input("Please choose a splitting criterion (gini/entropy): ")
                allValues = [2,3,4,5,6,7,8,10,12,14,16]#,32,64] # The allowed depths of the tree
            elif arguments.algorithm == "bayes":
                criterion = raw_input("Please choose a dimensionality reduction method (SelectKBest/PCA): ").lower()
                allValues = [8,16,32,64,128]
            # Accuracy/timing matrices: one row per fold, one column per value
            allAccuracyRates, allTimings = numpy.zeros((int(arguments.kfold), len(allValues))), numpy.zeros((int(arguments.kfold), len(allValues)))
            # A file to log all classification labels
            classificationLog = open("classificationlog_%s_exp2_%s_%s.txt" % (arguments.datatype, criterion, arguments.algorithm), "a")
            classificationLog.write("Experiment 2 - Algorithm: %s, Datatype: %s\n" % (arguments.algorithm, arguments.datatype))

            def obfuscatedVersionsOf(program):
                # Map an original program to the feature files of its obfuscated versions.
                programName = program.replace(arguments.originalprograms, "").replace("/", "")  # Isolate program name
                # For "40programs" datasets names look like "anagram_123_12.c";
                # otherwise (e.g. "obf") like "empty-Seed1-...-addOpaque16.c".
                separator = "_" if arguments.sourcedir.find("40programs") != -1 else "-"
                return glob.glob("%s/%s%s*.%s" % (arguments.sourcedir, programName.replace(".c", ""), separator, arguments.datatype))

            # FIX: the original hard-coded range(10); the matrices above are
            # sized by arguments.kfold, so any other kfold mis-indexed them.
            for currentCycle in range(int(arguments.kfold)):
                prettyPrint("Cycle #%s out of %s cycles" % (currentCycle+1, int(arguments.kfold)))
                trainingPrograms, testPrograms = list(allPrograms), []
                # Specify the indices of the training and test datasets
                testStartIndex = (totalPrograms + (chunkSize * currentCycle)) % totalPrograms
                testStopIndex = testStartIndex + chunkSize
                if arguments.verbose == "yes":
                    prettyPrint("Retrieving training and test programs for the current cycle", "debug")
                # Populate the test dataset and remove its members from training
                testPrograms = trainingPrograms[testStartIndex:testStopIndex]
                trainingPrograms = [x for x in trainingPrograms if not x in trainingPrograms[testStartIndex:testStopIndex]]
                if arguments.verbose == "yes":
                    prettyPrint("Original training programs: %s, original test programs: %s" % (len(trainingPrograms), len(testPrograms)), "debug")
                # Replace each original program with the feature files of its
                # obfuscated versions (e.g. ".tfidf" files)
                tempTraining, tempTest = [], []
                for program in trainingPrograms:
                    versions = obfuscatedVersionsOf(program)
                    if len(versions) > 0:
                        tempTraining += versions
                for program in testPrograms:
                    versions = obfuscatedVersionsOf(program)
                    if len(versions) > 0:
                        tempTest += versions
                trainingPrograms, testPrograms = tempTraining, tempTest  # Update the training and test programs
                if arguments.verbose == "yes":
                    prettyPrint("Successfully retrieved %s training and %s test programs" % (len(trainingPrograms), len(testPrograms)), "debug")
                # Generate the TF-IDF features on the fly for this batch
                if arguments.verbose == "yes":
                    prettyPrint("Generating TF-IDF features for the current training and test traces", "debug")
                if feature_extraction.extractTFIDFMemoryFriendly(trainingPrograms, arguments.datatype, 128, "%s_tr" % arguments.datatype):
                    prettyPrint("Successfully generated TF-IDF features for the current training batch")
                else:
                    prettyPrint("Unable to generate TF-IDF features for the current training batch", "warning")
                    continue
                # Now for the test batch
                if feature_extraction.extractTFIDFMemoryFriendly(testPrograms, arguments.datatype, 128, "%s_te" % arguments.datatype):
                    prettyPrint("Successfully generated TF-IDF features for the current test batch")
                else:
                    prettyPrint("Unable to generate TF-IDF features for the current test batch", "warning")
                    continue
                # Now load the programs of the given datatype
                prettyPrint("Loading training and test instances")
                Xtr, ytr, allClassestr = loadFeaturesFromList(trainingPrograms, "%s_tr" % arguments.datatype, arguments.datalabel)
                Xte, yte, allClasseste = loadFeaturesFromList(testPrograms, "%s_te" % arguments.datatype, arguments.datalabel, allClassestr)
                # Now apply the classification algorithm
                for value in allValues:
                    ##############
                    # CART Trees #
                    ##############
                    if arguments.algorithm == "tree":
                        prettyPrint("Training a \"CART\" with \"%s\" criterion and maximum depth of %s" % (criterion, value), "debug")
                        currentAccuracyRate, currentTiming, currentProbabilities, predictedLabels = classification.classifyTree(Xtr, ytr, Xte, yte, criterion, int(value), visualizeTree=False)
                        prettyPrint("Classification accuracy with \"%s\" and \"%s\" is: %s%%" % (criterion, value, (currentAccuracyRate*100.0)), "output")
                        allAccuracyRates[currentCycle][allValues.index(value)] = currentAccuracyRate
                        allTimings[currentCycle][allValues.index(value)] = currentTiming
                        # Log the results
                        classificationLog.write("Depth: %s\n" % value)
                        for index in range(len(testPrograms)):
                            classificationLog.write("%s: Class: %s, Predicted: %s\n" % (testPrograms[index], allClasseste[yte[index]], allClasseste[predictedLabels[index]]))
                    ###########################
                    # Multinomial Naive Bayes #
                    ###########################
                    elif arguments.algorithm == "bayes":
                        prettyPrint("Training a \"Multinomial Naive Bayes\" with \"%s\" criterion and dimensionality of %s" % (criterion, value), "debug")
                        currentAccuracyRate, currentTiming, currentProbabilities, predictedLabels = classification.classifyNaiveBayes(Xtr, ytr, Xte, yte, criterion, int(value))
                        prettyPrint("Classification accuracy with \"%s\" and \"%s\" is: %s%%" % (criterion, value, (currentAccuracyRate*100.0)), "output")
                        allAccuracyRates[currentCycle][allValues.index(value)] = currentAccuracyRate
                        allTimings[currentCycle][allValues.index(value)] = currentTiming
                        # Log the results
                        classificationLog.write("Dimensionality: %s\n" % value)
                        for index in range(len(testPrograms)):
                            classificationLog.write("%s: Class: %s, Predicted: %s\n" % (testPrograms[index], allClasseste[yte[index]], allClasseste[predictedLabels[index]]))
                # Remove all TF-IDF files of the current batch
                # TODO: the "*.%s_t*" pattern also matches e.g. "tfidf_both" files.
                if arguments.verbose == "yes":
                    prettyPrint("Removing all TF-IDF files of the current batch", "debug")
                rmCounter = 0
                for featureFile in glob.glob("%s/*.%s_t*" % (arguments.sourcedir, arguments.datatype)):
                    os.unlink(featureFile)
                    rmCounter += 1
                prettyPrint("Successfully removed %s files" % rmCounter)
            classificationLog.close()
            # Now average the scored results stored in the matrices
            pointsX, pointsYacc, pointsYtime = [], [], []
            for value in allValues:
                pointsX.append(value)
                pointsYacc.append(averageList(allAccuracyRates[:,allValues.index(value)]))
                pointsYtime.append(averageList(allTimings[:,allValues.index(value)]))
            # Plot accuracies and timings graphs
            if arguments.algorithm == "tree":
                xAxisLabel = "Maximum Tree Depth"
            elif arguments.algorithm == "bayes":
                xAxisLabel = "Selected Features" if criterion == "select" else "Extracted Features"
            prettyPrint("Plotting accuracies for \"%s\" criterion" % criterion)
            data_visualization.plotAccuracyGraph(pointsX, pointsYacc, xAxisLabel, "Classification Accuracy", "Classification Accuracy: %s (%s)" % (criterion, arguments.datatype), "accuracy_%s_exp2_%s_%s.pdf" % (arguments.datatype, criterion, arguments.algorithm))
        ####################################
        # MODE X : Filter generated traces #
        ####################################
        elif arguments.mode == "filter-traces":
            # Retrieve the necessary parameters
            inExtension = raw_input("Input extension (Default: dyndis): ")
            outExtension = raw_input("Output extension (Default: dyndis_raw): ")
            filterMode = raw_input("Filteration mode {raw (Default), mem, both}: ")
            if filterTraces(arguments.sourcedir, inExtension, filterMode, outExtension, arguments.filterfunction):
                prettyPrint("Successfully filtered \"%s\" traces to \"%s\" traces using the \"%s\" filter" % (inExtension, outExtension, filterMode))
            else:
                prettyPrint("Some error occurred during filteration", "warning")
        ########################################################
        # MODE XI: Generate TF-IDF feature vectors from traces #
        ########################################################
        elif arguments.mode == "extract-from-traces":
            # Retrieve the necessary paramters
            inExtension = raw_input("Input extension (Default: dyndis): ")
            outExtension = raw_input("Output extension (Default: tfidf_raw): ")
            maxFeatures = int(raw_input("Maximum features: "))
            if feature_extraction.extractTFIDFMemoryFriendly(arguments.sourcedir, inExtension, maxFeatures, outExtension):
                prettyPrint("Successfully extracted %s TF-IDF features from traces with \"%s\" extension" % (maxFeatures, inExtension))
            else:
                prettyPrint("Some error occurred during TF-IDF feature extraction", "warning")
    except Exception as e:
        # Top-level boundary: report the failure with its line number, then
        # perform a best-effort cleanup before returning.
        prettyPrint("Error encountered in \"main\": %s at line %s" % (e, sys.exc_info()[2].tb_lineno), "error")
        cleanUp()
    return
def main():
    """Checkpoint-driven end-to-end pipeline for the Oedipus experiments.

    NOTE(review): this redefines the ``main()`` declared earlier in the file,
    so only this version is reachable at runtime -- confirm which one is
    intended to survive.

    Stages (each guarded by ``checkpoint(n)`` so already-completed work is
    skipped on re-runs):
      1. generate random C programs
      2. obfuscate them with Tigress
      3. generate static traces / features for labelled programs
      4. filter the objdump traces and extract TF-IDF features from them
      5. run the exp1/exp2 classification experiments (naive Bayes + CART)
    """
    # NOTE(review): Windows data path mixed with a Unix tigress path --
    # presumably only one of them is active per deployment; verify.
    source_dir = 'D:\\BGU\\dataset\\home\\vagrant\\random_programs'
    number_of_programs = 2000
    tigress_dir = '/oedipus/tigress-2.2'
    obfuscation_level = 1
    obfuscation_function = 'SECRET'
    max_features = 1000
    kfold = 10
    # Stage 1: random program generation
    if not checkpoint(1):
        if random_programs.generate_random_programs(source_dir, number_of_programs, obfuscation_function):
            prettyPrint("Successfully generated %d random programs" % number_of_programs)
        else:
            prettyPrint("Some error occurred during random program generation", "warning")
    # Stage 2: obfuscation via Tigress
    if not checkpoint(2):
        # Get programs from source directory [random/pre-existent]
        sourceFiles = sorted(glob.glob("%s%s*.c" % (source_dir, os.sep)))
        if len(sourceFiles) < 1:
            prettyPrint("No files were found in \"%s\". Exiting" % source_dir, "error")
            return
        program_generation.generateObfuscatedPrograms(sourceFiles, tigress_dir, obfuscation_level, obfuscation_function)
        prettyPrint("Successfully generated obfuscated programs")
    # Stage 3: trace generation for labelled, not-yet-dumped programs
    if not checkpoint(3):
        if not os.path.exists(source_dir):
            prettyPrint("Unable to locate \"%s\". Exiting" % source_dir, "error")
            return
        sourceFiles = sorted(glob.glob("%s%s*.c" % (source_dir, os.sep)))
        if len(sourceFiles) < 1:
            prettyPrint("No files were found in \"%s\". Exiting" % source_dir)
            # FIX: previously fell through and kept processing an empty list.
            return
        # Keep files that have a ".label" ground-truth file and no ".dyndis"
        # dump yet.
        # FIX: the original popped from the list while iterating it (skipping
        # the next element after each removal) and could pop twice for the
        # same file, making the second index() lookup raise.
        pendingFiles = []
        for targetFile in sourceFiles:
            if not os.path.exists(targetFile.replace(".c", ".label")):
                prettyPrint("File \"%s\" does not have a label/metadata file. Removing" % targetFile, "warning")
                continue
            if os.path.exists(targetFile.replace(".c", ".dyndis")):
                prettyPrint("File \"%s\" already have generated dumps. Removing" % targetFile, "warning")
                continue
            pendingFiles.append(targetFile)
        sourceFiles = pendingFiles
        prettyPrint("Generating static traces")
        if not feature_extraction.extractTFIDF(source_dir, sourceFiles):
            prettyPrint("Could not generate traces from source files. Exiting", "error")
            return
        prettyPrint("Successfully generated traces")
        cleanUp()
    # Stage 4: filter objdump traces and extract TF-IDF features
    if not checkpoint(4):
        flavors = ['objdump', 'objdumps']
        tfidf_flavors = ['tfidfobj', 'tfidfobjs']
        for i, flavor in enumerate(flavors):
            # "filter_mode" (not "filter") to avoid shadowing the builtin
            for filter_mode in ['raw', 'both']:
                filtered_input_ext = flavor + '_' + filter_mode
                output_ext = tfidf_flavors[i] + ('_both' if filter_mode == 'both' else '')
                if filterTraces(source_dir, flavor, filter_mode, filtered_input_ext, obfuscation_function):
                    # FIX: the first placeholder used to print the filter mode
                    # instead of the input flavor extension.
                    prettyPrint("Successfully filtered \"%s\" traces to \"%s\" traces using the \"%s\" filter" % (flavor, filtered_input_ext, filter_mode))
                else:
                    prettyPrint("Some error occurred during filteration", "warning")
                if feature_extraction.extractTFIDFMemoryFriendly(source_dir, filtered_input_ext, max_features, output_ext):
                    prettyPrint("Successfully extracted %s TF-IDF features from traces with \"%s\" extension" % (max_features, filtered_input_ext))
                else:
                    prettyPrint("Some error occurred during TF-IDF feature extraction", "error")
                    return
    # Stage 5: classification experiments
    if not checkpoint(5):
        for exp in ['exp1', 'exp2']:
            for flavor in ['tfidfobj', 'tfidfobjs', 'tfidfobj_both', 'tfidfobjs_both']:
                for algo in ["bayes", "tree"]:
                    if algo == 'bayes':
                        X, y, allClasses, originalPrograms = loadFeaturesFromDir(source_dir, flavor, 'label')
                        for reduction_method in ['selectkbest', 'pca', 'none']:
                            # Skip combinations whose accuracy plot already exists
                            if os.path.exists("accuracy_%s_%s_%s_%s.pdf" % (flavor, exp, algo, reduction_method)):
                                continue
                            # A file to log all classification labels
                            classificationLog = open("classificationlog_%s_%s_%s_%s.txt" % (flavor, exp, reduction_method, algo), "a")
                            classificationLog.write("Experiment 1 - Algorithm: %s, Datatype: %s\n" % (algo, flavor))
                            if reduction_method == "selectkbest":
                                accuracies, timings = [], []
                                targetDimensions = [8, 16, 32, 64, 128]#[64, 128, 256, 512, 1000]
                                for dimension in targetDimensions:
                                    accuracyRates, allProbabilities, allTimings, groundTruthLabels, predictedLabels = classification.classifyNaiveBayesKFold(X, y, originalPrograms, kFold=kfold, reduceDim=reduction_method, targetDim=dimension, exp2=(exp == 'exp2'))
                                    prettyPrint("Average classification accuracy: %s%%" % (averageList(accuracyRates)*100.0), "output")
                                    accuracies.append(averageList(accuracyRates))
                                    timings.append(averageList(allTimings))
                                    # Log classifications
                                    for foldIndex in range(len(predictedLabels)):
                                        classificationLog.write("Target Dimensionality: %s\n" % dimension)
                                        for labelIndex in range(len(predictedLabels[foldIndex])):
                                            classificationLog.write("Class:%s,Predicted:%s\n" % (allClasses[groundTruthLabels[foldIndex][labelIndex]], allClasses[predictedLabels[foldIndex][labelIndex]]))
                                classificationLog.close()
                                # Plot accuracies graph
                                prettyPrint("Plotting accuracies")
                                data_visualization.plotAccuracyGraph(targetDimensions, accuracies, "Number of Selected Features", "Classification Accuracy", "Classification Accuracy: Selected Features (%s)" % flavor, "accuracy_%s_%s_%s_selectkbest.pdf" % (flavor, exp, algo))
                                print(timings)
                            elif reduction_method == "pca":
                                accuracies, timings = [], []
                                targetDimensions = [8, 16, 32, 64, 128]#[2, 4, 8, 16, 32, 64, 128, 256, 512, 1000]
                                for dimension in targetDimensions:
                                    accuracyRates, allProbabilities, allTimings, groundTruthLabels, predictedLabels = classification.classifyNaiveBayesKFold(X, y, originalPrograms, kFold=kfold, reduceDim=reduction_method, targetDim=dimension, exp2=(exp == 'exp2'))
                                    prettyPrint("Average classification accuracy: %s%%" % (averageList(accuracyRates)*100.0), "output")
                                    accuracies.append(averageList(accuracyRates))
                                    timings.append(averageList(allTimings))
                                    # Log classifications
                                    for foldIndex in range(len(predictedLabels)):
                                        classificationLog.write("Target Dimensionality: %s\n" % dimension)
                                        for labelIndex in range(len(predictedLabels[foldIndex])):
                                            classificationLog.write("Class:%s,Predicted:%s\n" % (allClasses[groundTruthLabels[foldIndex][labelIndex]], allClasses[predictedLabels[foldIndex][labelIndex]]))
                                classificationLog.close()
                                # Plot accuracies graph
                                prettyPrint("Plotting accuracies")
                                data_visualization.plotAccuracyGraph(targetDimensions, accuracies, "Number of Extracted Features", "Classification Accuracy", "Classification Accuracy: PCA (%s)" % flavor, "accuracy_%s_%s_%s_pca.pdf" % (flavor, exp, algo))
                                print(timings)
                            else:
                                # No dimensionality reduction; note no PDF is
                                # written here, so the skip-check above never
                                # fires for this branch.
                                accuracyRates, allProbabilities, allTimings, groundTruthLabels, predictedLabels = classification.classifyNaiveBayesKFold(X, y, originalPrograms, kFold=kfold, exp2=(exp == 'exp2'))
                                prettyPrint("Average classification accuracy: %s%%, achieved in an average of %s seconds" % (averageList(accuracyRates)*100.0, averageList(allTimings)), "output")
                    ####################
                    # Using CART trees #
                    ####################
                    elif algo == "tree":
                        # Load data from source directory
                        X, y, allClasses, originalPrograms = loadFeaturesFromDir(source_dir, flavor, 'label')
                        for splitting_criterion in ['gini', 'entropy']:
                            # Skip combinations whose accuracy plot already exists
                            if os.path.exists("accuracy_%s_%s_%s_%s.pdf" % (flavor, exp, splitting_criterion, algo)):
                                continue
                            # A file to log all classification labels
                            classificationLog = open("classificationlog_%s_%s_%s_%s.txt" % (flavor, exp, splitting_criterion, algo), "a")
                            classificationLog.write("Experiment 1 - Algorithm: %s, Datatype: %s\n" % (algo, flavor))
                            accuracies, timings, allDepths = [], [], [2,3,4,5,6,7,8,10,12,14,16]#,32,64]
                            for maxDepth in allDepths:
                                accuracyRates, allProbabilities, allTimings, groundTruthLabels, predictedLabels = classification.classifyTreeKFold(X, y, originalPrograms, kfold, splitting_criterion, int(maxDepth), visualizeTree=False, exp2=(exp == 'exp2'))
                                prettyPrint("Average classification accuracy: %s%%" % (averageList(accuracyRates)*100.0), "output")
                                accuracies.append(averageList(accuracyRates))
                                timings.append(averageList(allTimings))
                                # Log classifications
                                for foldIndex in range(len(predictedLabels)):
                                    classificationLog.write("Tree Depth: %s\n" % maxDepth)
                                    for labelIndex in range(len(predictedLabels[foldIndex])):
                                        classificationLog.write("Class:%s,Predicted:%s\n" % (allClasses[groundTruthLabels[foldIndex][labelIndex]], allClasses[predictedLabels[foldIndex][labelIndex]]))
                            classificationLog.close()
                            # Plot accuracies graph
                            prettyPrint("Plotting accuracies for \"%s\" criterion" % splitting_criterion)
                            data_visualization.plotAccuracyGraph(allDepths, accuracies, "Maximum Tree Depth", "Classification Accuracy", "Classification Accuracy: %s (%s)" % (splitting_criterion, flavor), "accuracy_%s_%s_%s_%s.pdf" % (flavor, exp, splitting_criterion, algo))
                            print(timings)