def main():
    """Evaluate a gini decision tree on every ordered pair of datasets.

    For each (dataset1, dataset2) pair the training split is loaded with
    ``load_dataset(dataset1, dataset1)`` and the test split with
    ``load_dataset(dataset1, dataset2)``.  The tree is then fitted once per
    maximum depth, the accuracy for each depth is printed, and the
    accuracy-vs-depth curve is written to ``accuracy_<d1>_<d2>.pdf``.
    The timings collected per depth are printed at the end of each pair.
    """
    names = ['random', 'simple1', 'simple2']

    for train_name in names:
        for test_name in names:
            print('Classifing: {0} - {1}'.format(train_name, test_name))
            X_train, y_train = load_dataset(train_name, train_name)
            X_test, y_test = load_dataset(train_name, test_name)

            # A fresh depth list per pair: it is handed to the plotting
            # helper, so it is not shared between iterations.
            depths = [4, 6, 8, 10, 12]
            scores = []
            durations = []

            for depth in depths:
                outcome = classification.classifyTree(
                    X_train, y_train, X_test, y_test,
                    'gini', int(depth), visualizeTree=False)
                # Only the accuracy and the timing are used below; the
                # probabilities and predicted labels are discarded.
                score, elapsed, _probabilities, _predicted = outcome
                print("Classification accuracy: %.2f" % (score * 100.0))
                scores.append(score)
                durations.append(elapsed)

            # Plot accuracies graph
            print("Plotting accuracies")
            pdf_name = "accuracy_{0}_{1}.pdf".format(train_name, test_name)
            data_visualization.plotAccuracyGraph(
                depths,
                scores,
                "Maximum Tree Depth",
                "Classification Accuracy",
                "Classification Accuracy: gini (tfidfs_cross)",
                pdf_name)
            print(durations)
# Example no. 2
# 0
def _drop_unusable_sources(sourceFiles):
    """Return the entries of ``sourceFiles`` that still need traces.

    A file is dropped (with a warning) when its ``.label`` companion does not
    exist, or when a ``.dyndis`` companion already exists.

    A new list is built rather than popping from the list being iterated:
    popping during iteration skips the element that follows every removal,
    and a second ``pop(index(...))`` for the same file raises ``ValueError``.
    """
    kept = []
    for targetFile in sourceFiles:
        if not os.path.exists(targetFile.replace(".c", ".label")):
            prettyPrint(
                "File \"%s\" does not have a label/metadata file. Removing"
                % targetFile, "warning")
            continue
        if os.path.exists(targetFile.replace(".c", ".dyndis")):
            prettyPrint(
                "File \"%s\" already have generated dumps. Removing" %
                targetFile, "warning")
            continue
        kept.append(targetFile)
    return kept


def _log_fold_predictions(classificationLog, header, allClasses,
                          groundTruthLabels, predictedLabels):
    """Write ``header`` followed by one class/prediction line per sample, for every fold."""
    for foldIndex, foldPredictions in enumerate(predictedLabels):
        classificationLog.write(header)
        for labelIndex, predicted in enumerate(foldPredictions):
            classificationLog.write(
                "Class:%s,Predicted:%s\n" %
                (allClasses[groundTruthLabels[foldIndex][labelIndex]],
                 allClasses[predicted]))


def _run_bayes_experiment(X, y, allClasses, originalPrograms, kfold, exp,
                          flavor, algo, reduction_method):
    """Run the naive Bayes k-fold experiment for one reduction method.

    Does nothing when the accuracy plot for this configuration already
    exists.  For ``selectkbest`` and ``pca`` the classifier is run once per
    target dimensionality, every fold is logged, and the accuracy curve is
    plotted.  Any other method runs a single classification without
    dimensionality reduction and only reports the averages.
    """
    plot_path = "accuracy_%s_%s_%s_%sanother_simple.pdf" % (
        flavor, exp, algo, reduction_method)
    if os.path.exists(plot_path):
        return

    # (x-axis label, plot title) for the methods that sweep dimensions.
    plot_text = {
        "selectkbest": ("Number of Selected Features",
                        "Classification Accuracy: Selected Features (%s)"),
        "pca": ("Number of Extracted Features",
                "Classification Accuracy: PCA (%s)"),
    }
    log_path = "classificationlog_%s_%s_%s_%sanother_simple.txt" % (
        flavor, exp, reduction_method, algo)
    accuracies, timings = [], []
    targetDimensions = [8, 16, 32, 64, 128]

    # The context manager closes the log on every path, including the
    # no-reduction branch and any exception raised by the classifier.
    with open(log_path, "a") as classificationLog:
        # NOTE(review): the header says "Experiment 1" even when exp is
        # 'exp2'; kept as-is in case existing logs are parsed -- confirm.
        classificationLog.write(
            "Experiment 1 - Algorithm: %s, Datatype: %s\n" % (algo, flavor))

        if reduction_method not in plot_text:
            accuracyRates, _probabilities, allTimings, _truth, _predicted = \
                classification.classifyNaiveBayesKFold(
                    X,
                    y,
                    originalPrograms,
                    kFold=kfold,
                    exp2=(exp == 'exp2'))
            prettyPrint(
                "Average classification accuracy: %s%%, achieved in an average of %s seconds"
                % (averageList(accuracyRates) * 100.0,
                   averageList(allTimings)), "output")
            return

        for dimension in targetDimensions:
            accuracyRates, _probabilities, allTimings, groundTruthLabels, predictedLabels = \
                classification.classifyNaiveBayesKFold(
                    X,
                    y,
                    originalPrograms,
                    kFold=kfold,
                    reduceDim=reduction_method,
                    targetDim=dimension,
                    exp2=(exp == 'exp2'))
            averageAccuracy = averageList(accuracyRates)
            prettyPrint(
                "Average classification accuracy: %s%%" %
                (averageAccuracy * 100.0), "output")
            accuracies.append(averageAccuracy)
            timings.append(averageList(allTimings))
            _log_fold_predictions(
                classificationLog,
                "Target Dimensionality: %s\n" % dimension,
                allClasses, groundTruthLabels, predictedLabels)

    # Plot accuracies graph
    xLabel, titleTemplate = plot_text[reduction_method]
    prettyPrint("Plotting accuracies")
    data_visualization.plotAccuracyGraph(
        targetDimensions, accuracies, xLabel, "Classification Accuracy",
        titleTemplate % flavor, plot_path)
    print(timings)


def _run_tree_experiment(X, y, allClasses, originalPrograms, kfold, exp,
                         flavor, algo, splitting_criterion):
    """Run the CART k-fold experiment for one splitting criterion.

    Does nothing when the accuracy plot for this configuration already
    exists.  Otherwise the classifier is run once per maximum depth, every
    fold is logged, and the accuracy-vs-depth curve is plotted.
    """
    plot_path = "accuracy_%s_%s_%s_%sanother_simple.pdf" % (
        flavor, exp, splitting_criterion, algo)
    if os.path.exists(plot_path):
        return

    log_path = "classificationlog_%s_%s_%s_%sanother_simple.txt" % (
        flavor, exp, splitting_criterion, algo)
    accuracies, timings, allDepths = [], [], [4, 6, 8, 10, 12]

    with open(log_path, "a") as classificationLog:
        # NOTE(review): header is "Experiment 1" for both experiments; kept
        # unchanged -- confirm whether exp2 should be labelled differently.
        classificationLog.write(
            "Experiment 1 - Algorithm: %s, Datatype: %s\n" % (algo, flavor))
        for maxDepth in allDepths:
            accuracyRates, _probabilities, allTimings, groundTruthLabels, predictedLabels = \
                classification.classifyTreeKFold(
                    X,
                    y,
                    originalPrograms,
                    kfold,
                    splitting_criterion,
                    int(maxDepth),
                    visualizeTree=False,
                    exp2=(exp == 'exp2'))
            averageAccuracy = averageList(accuracyRates)
            prettyPrint(
                "Average classification accuracy: %s%%" %
                (averageAccuracy * 100.0), "output")
            accuracies.append(averageAccuracy)
            timings.append(averageList(allTimings))
            _log_fold_predictions(
                classificationLog, "Tree Depth: %s\n" % maxDepth,
                allClasses, groundTruthLabels, predictedLabels)

    # Plot accuracies graph
    prettyPrint(
        "Plotting accuracies for \"%s\" criterion" % splitting_criterion)
    data_visualization.plotAccuracyGraph(
        allDepths, accuracies, "Maximum Tree Depth",
        "Classification Accuracy",
        "Classification Accuracy: %s (%s)" % (splitting_criterion, flavor),
        plot_path)
    print(timings)


def main():
    """Run the four-stage pipeline with hard-coded settings.

    Each stage runs only when ``checkpoint(<stage>)`` is falsy:

    1. generate obfuscated programs from the ``*.c`` files in ``source_dir``;
    2. generate static traces for the sources that have a ``.label`` file
       and no ``.dyndis`` file yet;
    3. vectorise the ``objdumps`` traces into TF-IDF features;
    4. run the k-fold classification experiments ('exp1' and 'exp2') and
       plot their accuracies.

    The function returns early, after reporting the problem through
    ``prettyPrint``, as soon as a stage cannot proceed.
    """
    source_dir = 'D:\\BGU\\Oedipus\\another_simple_programs'
    tigress_dir = '/oedipus/tigress-2.2'
    obfuscation_level = 1
    obfuscation_function = 'main'
    max_features = 1000
    kfold = 10

    if not checkpoint(1):
        # Get programs from source directory [random/pre-existent]
        sourceFiles = sorted(glob.glob("%s%s*.c" % (source_dir, os.sep)))
        if not sourceFiles:
            prettyPrint("No files were found in \"%s\". Exiting" % source_dir,
                        "error")
            return

        # NOTE(review): the return value is not inspected, so success is
        # reported unconditionally -- confirm what the generator returns.
        program_generation.generateObfuscatedPrograms(
            sourceFiles, tigress_dir, obfuscation_level, obfuscation_function)
        prettyPrint("Successfully generated obfuscated programs")

    if not checkpoint(2):
        if not os.path.exists(source_dir):
            prettyPrint("Unable to locate \"%s\". Exiting" % source_dir,
                        "error")
            return
        sourceFiles = sorted(glob.glob("%s%s*.c" % (source_dir, os.sep)))
        if not sourceFiles:
            # Actually exit, as the message says and as stage 1 does.
            prettyPrint("No files were found in \"%s\". Exiting" % source_dir,
                        "error")
            return

        sourceFiles = _drop_unusable_sources(sourceFiles)

        prettyPrint("Generating static traces")
        if not feature_extraction.extractTFIDF(source_dir, sourceFiles):
            prettyPrint("Could not generate traces from source files. Exiting",
                        "error")
            return

        prettyPrint("Successfully generated traces")
        cleanUp()

    if not checkpoint(3):
        flavors = ['objdumps']
        tfidf_flavors = ['tfidfobjs']
        for flavor, tfidf_flavor in zip(flavors, tfidf_flavors):
            # "filter_mode" rather than "filter" to avoid shadowing the builtin.
            for filter_mode in ['both']:
                filtered_input_ext = flavor + '_' + filter_mode
                output_ext = tfidf_flavor + (
                    '_both' if filter_mode == 'both' else '') + '_vec'
                # The same list object is used for training and testing.
                train_files = test_files = glob.glob(
                    '%s%s*.%s' % (source_dir, os.sep, flavor))
                if not feature_extraction.extractTFIDFWithVectorizer(
                        train_files, test_files, max_features, output_ext):
                    prettyPrint(
                        "Some error occurred during TF-IDF feature extraction",
                        "error")
                    return
                prettyPrint(
                    "Successfully extracted %s TF-IDF features from traces with \"%s\" extension"
                    % (max_features, filtered_input_ext))

    if not checkpoint(4):
        for exp in ['exp1', 'exp2']:
            for flavor in ['tfidfobjs_both_vec']:
                # Only "tree" is enabled; the "bayes" path is kept so it can
                # be switched back on by adding it to this list.
                for algo in ["tree"]:
                    if algo == 'bayes':
                        X, y, allClasses, originalPrograms = loadFeaturesFromDir(
                            source_dir, flavor, 'label')
                        for reduction_method in ['selectkbest', 'pca', 'none']:
                            _run_bayes_experiment(
                                X, y, allClasses, originalPrograms, kfold,
                                exp, flavor, algo, reduction_method)
                    ####################
                    # Using CART trees #
                    ####################
                    elif algo == "tree":
                        # Load data from source directory
                        X, y, allClasses, originalPrograms = loadFeaturesFromDir(
                            source_dir, flavor, 'label')
                        for splitting_criterion in ['gini']:
                            _run_tree_experiment(
                                X, y, allClasses, originalPrograms, kfold,
                                exp, flavor, algo, splitting_criterion)
# Example no. 3
# 0
def main():
    try:
 
        argumentParser = defineArguments()
        arguments = argumentParser.parse_args()
        prettyPrint("Welcome to \"Oedipus\". Riddle me this!")

        #################################################
        # MODE 1: Generate obfuscated source code files #
        #################################################

        #done 调用Tigress生成混淆文件和.label文件(标记对应混淆文件使用了哪种混淆)
        if arguments.mode == "generate":
           if arguments.verbose == "yes":
               prettyPrint("Generating obfusted programs for programs under \"%s\"" %  arguments.sourcedir, "debug")
           # Get programs from source directory [random/pre-existent]
           sourceFiles = sorted(glob.glob("%s/*.c" % arguments.sourcedir))
           if len(sourceFiles) < 1:
               prettyPrint("No files were found in \"%s\". Exiting" % arguments.sourcedir, "error")
               return

           generationStatus = program_generation.generateObfuscatedPrograms(sourceFiles, arguments.tigressdir, int(arguments.obfuscationlevel), arguments.obfuscationfunction) # Generate obfuscated programs
            
           prettyPrint("Successfully generated obfuscated programs")
        
        #########################################################
        # MODE 2: Extract features from obfuscated source files #
        #########################################################

        #done   提取特征
        elif arguments.mode == "extract":
            # Load obfuscated files
            if not os.path.exists(arguments.sourcedir):
                prettyPrint("Unable to locate \"%s\". Exiting" % arguments.sourcedir, "error")
                return
            sourceFiles = sorted(glob.glob("%s/*.c" % arguments.sourcedir))#返回sourcedir目录下所有以.c结尾的文件并排序,sorted()返回一个新的List
            if len(sourceFiles) < 1:
                prettyPrint("No files were found in \"%s\". Exiting" % arguments.sourcedir)
            
            # Remove source files without ".label" files
            for targetFile in sourceFiles:            
                if not os.path.exists(targetFile.replace(".c", ".label")):
                    prettyPrint("File \"%s\" does not have a label/metadata file. Removing" % targetFile, "warning")
                    sourceFiles.pop( sourceFiles.index(targetFile) )#如果.c文件没有对应的.lable文件,则对其不进行后面的处理(提取TF-IDF))

            ########################################################################
            # (2.0) Extract TF-IDF features from GDB generated traces of KLEE inputs
            prettyPrint("Extracting TF-IDF from GDB traces")
            if not feature_extraction.extractTFIDF(arguments.sourcedir, sourceFiles):
                prettyPrint("Could not extract features from source files. Exiting", "error")
                return
            ########################################################################

            prettyPrint("Alright!! Alles in Ordnung.", "info2")
            cleanUp()
            return

        ###########################################################
        # MODE 3: Project data samples into <x>-dimensional space #
        ###########################################################

        #done else可以执行到????
        elif arguments.mode.find("visualize") != -1:
            if arguments.mode == "visualize":
                prettyPrint("Plotting data into %s-dimensional space with \"%s\" features." % (arguments.dimension, arguments.datatype))
                data_visualization.visualizeData(arguments.sourcedir, arguments.datatype, arguments.dimension, algorithm=arguments.visualalgorithm)
            else:
                data_visualization.visualizeOriginal(arguments.sourcedir, arguments.datatype, arguments.dimension, algorithm=arguments.visualalgorithm)

        ##############################################################################
        # MODE 4: Classify obfuscated programs using knowledge-based  classification #
        ##############################################################################

        #done
        elif arguments.mode == "classify-exp1":
           # Check the requested algorithm

           #使用朴素贝叶斯模型,操作和else中决策树基本相同(参考else中注释),只是中间调用了不同函数  
           if arguments.algorithm == "bayes":
               # Classify using Naive Bayes
               if arguments.datatype.find("idf") == -1:
                   prettyPrint("Naive Bayes does not support the data type \"%s\". Exiting" % arguments.datatype, "warning")
                   #return
               # Load data from source directory
               X, y, allClasses = loadFeaturesFromDir(arguments.sourcedir, arguments.datatype, arguments.datalabel)
               reductionMethod = raw_input("Please choose a dimensionality reduction method (selectkbest/pca): ").lower()
               classificationLog = open("classificationlog_%s_exp1_%s_%s.txt" % (arguments.datatype, reductionMethod, arguments.algorithm), "a") # A file to log all classification labels
               classificationLog.write("Experiment 1 - Algorithm: %s, Datatype: %s\n" % (arguments.algorithm, arguments.datatype))

               #判断使用哪种方法减少特征向量的维度
               if reductionMethod == "selectkbest":
                   accuracies, timings = [], []
                   targetDimensions = [8, 16, 32, 64, 128]#[64, 128, 256, 512, 1000]
                   for dimension in targetDimensions:
                       if arguments.verbose == "yes":
                           prettyPrint("Training a naive Bayes classifier with %s selected \"%s\" features" % (dimension, arguments.datatype), "debug")
                       accuracyRates, allProbabilities, allTimings, groundTruthLabels, predictedLabels = classification.classifyNaiveBayesKFold(X, y, kFold=int(arguments.kfold), reduceDim=reductionMethod, targetDim=dimension)
                       prettyPrint("Average classification accuracy: %s%%" % (averageList(accuracyRates)*100.0), "output")
                       accuracies.append(averageList(accuracyRates))
                       timings.append(averageList(allTimings))
                       # Log classifications
                       for foldIndex in range(len(predictedLabels)):
                           classificationLog.write("Target Dimensionality: %s\n" % dimension)
                           for labelIndex in range(len(predictedLabels[foldIndex])):
                               classificationLog.write("Class:%s,Predicted:%s\n" % (allClasses[groundTruthLabels[foldIndex][labelIndex]], allClasses[predictedLabels[foldIndex][labelIndex]]))
                   
                   classificationLog.close()
                   # Plot accuracies graph
                   prettyPrint("Plotting accuracies")
                   data_visualization.plotAccuracyGraph(targetDimensions, accuracies, "Number of Selected Features", "Classification Accuracy", "Classification Accuracy: Selected Features (%s)" % arguments.datatype, "accuracy_%s_exp1_%s_selectkbest.pdf" % (arguments.datatype, arguments.algorithm)) 
                   # Plot performance graph
                   print (timings)
                   #prettyPrint("Plotting performance")
                   #data_visualization.plotAccuracyGraph(targetDimensions, timings, "Number of Selected Features", "Classification Timing (sec)", "Classification Timing: Selected Features (%s)" % arguments.datatype) 
                  
               elif reductionMethod == "pca":
                   accuracies, timings = [], []
                   targetDimensions = [8, 16, 32, 64, 128]#[2, 4, 8, 16, 32, 64, 128, 256, 512, 1000]
                   for dimension in targetDimensions:
                       if arguments.verbose == "yes":
                           prettyPrint("Training a naive Bayes classifier with %s extracted \"%s\" features" % (dimension, arguments.datatype), "debug")
                       accuracyRates, allProbabilities, allTimings, groundTruthLabels, predictedLabels = classification.classifyNaiveBayesKFold(X, y, kFold=int(arguments.kfold), reduceDim=reductionMethod, targetDim=dimension)
                       prettyPrint("Average classification accuracy: %s%%" % (averageList(accuracyRates)*100.0), "output")
                       accuracies.append(averageList(accuracyRates))
                       timings.append(averageList(allTimings))
                       # Log classifications
                       for foldIndex in range(len(predictedLabels)):
                           classificationLog.write("Target Dimensionality: %s\n" % dimension)
                           for labelIndex in range(len(predictedLabels[foldIndex])):
                               classificationLog.write("Class:%s,Predicted:%s\n" % (allClasses[groundTruthLabels[foldIndex][labelIndex]], allClasses[predictedLabels[foldIndex][labelIndex]]))

                   classificationLog.close()
                   # Plot accuracies graph
                   prettyPrint("Plotting accuracies")
                   data_visualization.plotAccuracyGraph(targetDimensions, accuracies, "Number of Extracted Features", "Classification Accuracy", "Classification Accuracy: PCA (%s)" % arguments.datatype, "accuracy_%s_exp1_%s_pca.pdf" % (arguments.datatype, arguments.algorithm))
                   # Plot performance graph
                   print (timings)
                   #prettyPrint("Plotting performance")
                   #data_visualization.plotAccuracyGraph(targetDimensions, timings, "Number of Extracted Features", "Classification Timing (sec)", "Classification Timing: PCA (%s)" % arguments.datatype)

               else:    
                   accuracyRates, allProbabilities, allTimings, predictedLabels = classification.classifyNaiveBayes(X, y, kFold=int(arguments.kfold))
                   prettyPrint("Average classification accuracy: %s%%, achieved in an average of %s seconds" % (averageList(accuracyRates)*100.0, averageList(allTimings)), "output")
           ####################
           # Using CART trees #
           ####################
           
           #done 使用决策树、KFold训练数据,并输出结果(准确率,预期结果,实际结果,性能等)
           elif arguments.algorithm == "tree":
               # Classify using CART trees
               if arguments.datatype != "triton":
                   prettyPrint("It is recommended to use \".triton\" features", "warning")
               # Load data from source directory

               #X所有特征值,y每个文件混淆方法的索引(针对allCLasses),allClasses包含所有混淆方法
               X, y, allClasses = loadFeaturesFromDir(arguments.sourcedir, arguments.datatype, arguments.datalabel)
               splittingCriterion = raw_input("Please choose a splitting criterion (gini/entropy): ")
               classificationLog = open("classificationlog_%s_exp1_%s_%s.txt" % (arguments.datatype, splittingCriterion, arguments.algorithm), "a") # A file to log all classification labels
               classificationLog.write("Experiment 1 - Algorithm: %s, Datatype: %s\n" % (arguments.algorithm, arguments.datatype))
               #maxDepth = raw_input("Please choose a maximum depth for the tree (0 = Maximum Possible): ") # Should be (2,4,8,16)
               accuracies, timings, allDepths = [], [], [2,3,4,5,6,7,8,10,12,14,16]#,32,64]
               for maxDepth in allDepths:
                   if arguments.verbose == "yes":
                       prettyPrint("Training a \"CART\" with \"%s\" criterion and maximum depth of %s" % (splittingCriterion, maxDepth), "debug")
                   accuracyRates, allProbabilities, allTimings, groundTruthLabels, predictedLabels = classification.classifyTreeKFold(X, y, int(arguments.kfold), splittingCriterion, int(maxDepth), visualizeTree=False)
                   #print accuracyRates, allProbabilities
                   prettyPrint("Average classification accuracy: %s%%" % (averageList(accuracyRates)*100.0), "output")#计算多次准确率的平均值
                   accuracies.append(averageList(accuracyRates))
                   timings.append(averageList(allTimings))
                   # Log classifications
                   for foldIndex in range(len(predictedLabels)):
                       classificationLog.write("Tree Depth: %s\n" % maxDepth)
                       for labelIndex in range(len(predictedLabels[foldIndex])):
                           classificationLog.write("Class:%s,Predicted:%s\n" % (allClasses[groundTruthLabels[foldIndex][labelIndex]], allClasses[predictedLabels[foldIndex][labelIndex]]))

               classificationLog.close()
               # Plot accuracies graph
               prettyPrint("Plotting accuracies for \"%s\" criterion" % splittingCriterion)
               data_visualization.plotAccuracyGraph(allDepths, accuracies, "Maximum Tree Depth", "Classification Accuracy", "Classification Accuracy: %s (%s)" % (splittingCriterion, arguments.datatype), "accuracy_%s_exp1_%s_%s.pdf" % (arguments.datatype, splittingCriterion, arguments.algorithm))
               # Plot performance graph
               #prettyPrint("Plotting timings")
               #data_visualization.plotAccuracyGraph(allDepths, timings, "Maximum Tree Depth", "Classification Timing (sec)", "Classification Timing: %s (%s)" % (splittingCriterion, arguments.datatype))
               print (timings)
 
           return

        ##################################################################
        # MODE 6: Classify obfuscated programs using the 36-4 experiment #
        ##################################################################

        #done 对本来提取特征以后的tfidf文件再进行了特征提取,而且用到KFold交叉验证方法
        elif arguments.mode == "classify-exp2":
            # Retrieve the list of all programs
            allPrograms = glob.glob("%s/*.c" % arguments.originalprograms)#list(set(sorted(glob.glob("%s/*.c" % arguments.sourcedir))) - set(sorted(glob.glob("%s/*-*.c" % arguments.sourcedir))))
            allPrograms.sort() # Makes it easier to keep track of current programs in batch
            totalPrograms = len(allPrograms)
            prettyPrint("Successfully retrieved %s original programs" % totalPrograms)
            chunkSize =  totalPrograms/int(arguments.kfold) # 4 = 40 / 10 (default)

            if arguments.algorithm == "tree":
                criterion = raw_input("Please choose a splitting criterion (gini/entropy): ")
                allValues = [2,3,4,5,6,7,8,10,12,14,16]#,32,64] # The allowed depths of the tree
            elif arguments.algorithm == "bayes":
                criterion = raw_input("Please choose a dimensionality reduction method (SelectKBest/PCA): ").lower()
                allValues = [8,16,32,64,128]# if criterion.lower() == "selectkbest" else [8,16,32,64,128]       
            # Define the structure of the accuracy and timing matrices
            # numpy.zeros((x, y)) creates a matrix with x rows and y columns
            allAccuracyRates, allTimings = numpy.zeros((int(arguments.kfold), len(allValues))), numpy.zeros((int(arguments.kfold), len(allValues)))
            classificationLog = open("classificationlog_%s_exp2_%s_%s.txt" % (arguments.datatype, criterion, arguments.algorithm), "a") # A file to log all classification labels
            classificationLog.write("Experiment 2 - Algorithm: %s, Datatype: %s\n" % (arguments.algorithm, arguments.datatype))

            for currentCycle in range(10):
                prettyPrint("Cycle #%s out of %s cycles" % (currentCycle+1, int(arguments.kfold)))
                trainingPrograms, testPrograms = [] + allPrograms, []
                # Specify the indices of the training and test datasets
                testStartIndex = (totalPrograms + (chunkSize * currentCycle)) % totalPrograms
                testStopIndex = testStartIndex + chunkSize
                if arguments.verbose == "yes":
                    prettyPrint("Retrieving training and test programs for the current cycle", "debug")
                # Populate the test dataset
                testPrograms = trainingPrograms[testStartIndex:testStopIndex]
                # Remove the indices from trainingPrograms
                trainingPrograms = [x for x in trainingPrograms if not x in trainingPrograms[testStartIndex:testStopIndex]]

                if arguments.verbose == "yes":
                    prettyPrint("Original training programs: %s, original test programs: %s" % (len(trainingPrograms), len(testPrograms)), "debug")
                # Now load the training and test samples from the source directory
                # 1- First we need to retrieve the obfuscated versions of the original programs
                tempTraining, tempTest = [], []

                # For each file (.c) in the training and test sets, collect the corresponding files that store its feature vectors (e.g. .tfidf files)
                for program in trainingPrograms:
                    programName = program.replace(arguments.originalprograms, "").replace("/","") # Isolate program name 去掉路径,仅保留文件名
                    # TODO: Important: For 40 programs, programs are like "anagram_1231231231_12.c"
                    # TODO: for "obf" programs, programs are like "empty-Seed1-Random......-addOpaque16.c"
                    separator = "_" if arguments.sourcedir.find("40programs") != - 1 else "-"
                    #print "%s/%s%s*.%s" % (arguments.sourcedir, programName.replace(".c", ""), separator, arguments.datatype)
                    obfuscatedVersions = glob.glob("%s/%s%s*.%s" % (arguments.sourcedir, programName.replace(".c", ""), separator, arguments.datatype)) 
                    #print programName, len(obfuscatedVersions)
                    #print "%s/%s_*.%s" % (arguments.sourcedir, programName.replace(".c", ""), arguments.datatype)
                    if len(obfuscatedVersions) > 0:
                        tempTraining += obfuscatedVersions
                    #print programName, len(obfuscatedVersions)
                for program in testPrograms:
                    programName = program.replace(arguments.originalprograms, "").replace("/","") # Isolate program name
                    # TODO: Important: For 40 programs, programs are like "anagram_1231231231_12.c"
                    # TODO: for "obf" programs, programs are like "empty-Seed1-Random......-addOpaque16.c"
                    separator = "_" if arguments.sourcedir.find("40programs") != - 1 else "-"
                    obfuscatedVersions = glob.glob("%s/%s%s*.%s" % (arguments.sourcedir, programName.replace(".c", ""), separator, arguments.datatype)) 
                    if len(obfuscatedVersions) > 0:
                       tempTest += obfuscatedVersions
                trainingPrograms, testPrograms = tempTraining, tempTest # Update the training and test programs

                if arguments.verbose == "yes":
                    prettyPrint("Successfully retrieved %s training and %s test programs" % (len(trainingPrograms), len(testPrograms)), "debug")
                # (Added January 15): Generate the TF-IDF features on the fly
                if arguments.verbose == "yes":
                    prettyPrint("Generating TF-IDF features for the current training and test traces", "debug")

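                # Apply TF-IDF feature extraction again to the feature files of each batch; the results are saved under the "<datatype>_tr" / "<datatype>_te" extensions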
                if feature_extraction.extractTFIDFMemoryFriendly(trainingPrograms, arguments.datatype, 128, "%s_tr" % arguments.datatype):
                    prettyPrint("Successfully generated TF-IDF features for the current training batch") 
                else:
                    prettyPrint("Unable to generate TF-IDF features for the current training batch", "warning")
                    continue
                # Now for the test batch
                if feature_extraction.extractTFIDFMemoryFriendly(testPrograms, arguments.datatype, 128, "%s_te" % arguments.datatype):
                    prettyPrint("Successfully generated TF-IDF features for the current test batch")
                else:
                    prettyPrint("Unable to generate TF-IDF features for the current test batch", "warning")
                    continue

                # Now load the programs of the given datatype
                prettyPrint("Loading training and test instances")
                Xtr, ytr, allClassestr = loadFeaturesFromList(trainingPrograms, "%s_tr" % arguments.datatype, arguments.datalabel)
                Xte, yte, allClasseste = loadFeaturesFromList(testPrograms, "%s_te" % arguments.datatype, arguments.datalabel, allClassestr)
                # Now apply the classification algorithm 
                # Train the model
                for value in allValues:
                    ##############
                    # CART Trees #
                    ##############
                    if arguments.algorithm == "tree":
                        prettyPrint("Training a \"CART\" with \"%s\" criterion and maximum depth of %s" % (criterion, value), "debug")
                        currentAccuracyRate, currentTiming, currentProbabilities, predictedLabels = classification.classifyTree(Xtr, ytr, Xte, yte, criterion, int(value), visualizeTree=False)
                        prettyPrint("Classification accuracy with \"%s\" and \"%s\" is: %s%%" % (criterion, value, (currentAccuracyRate*100.0)), "output")
                        #print "before!!!! currentCycle: %s, value: %s, allValues.index(value): %s" % (currentCycle, value, allValues.index(value))
                        allAccuracyRates[currentCycle][allValues.index(value)] = currentAccuracyRate
                        allTimings[currentCycle][allValues.index(value)] = currentTiming
                        #print "after assignments"
                        # Log the results
                        classificationLog.write("Depth: %s\n" % value)
                        #print len(yte), len(predictedLabels), len(testPrograms)
                        for index in range(len(testPrograms)): 
                            classificationLog.write("%s: Class: %s, Predicted: %s\n" % (testPrograms[index], allClasseste[yte[index]], allClasseste[predictedLabels[index]]))
                        #print "after writing"
                    ###########################
                    # Multinomial Naive Bayes #
                    ###########################
                    elif arguments.algorithm == "bayes":
                        prettyPrint("Training a \"Multinomial Naive Bayes\" with \"%s\" criterion and dimensionality of %s" % (criterion, value), "debug")
                        currentAccuracyRate, currentTiming, currentProbabilities, predictedLabels = classification.classifyNaiveBayes(Xtr, ytr, Xte, yte, criterion, int(value))

                        #print accuracyRates, allProbabilities
                        prettyPrint("Classification accuracy with \"%s\" and \"%s\" is: %s%%" % (criterion, value, (currentAccuracyRate*100.0)), "output")
                        allAccuracyRates[currentCycle][allValues.index(value)] = currentAccuracyRate
                        allTimings[currentCycle][allValues.index(value)] = currentTiming
                        # Log the results
                        classificationLog.write("Dimensionality: %s\n" % value)
                        #print len(yte), len(predictedLabels), len(testPrograms)
                        for index in range(len(testPrograms)): 
                            classificationLog.write("%s: Class: %s, Predicted: %s\n" % (testPrograms[index], allClasseste[yte[index]], allClasseste[predictedLabels[index]]))
                
                # TODO (Added January 15): Remove all TF-IDF files of the current batch
                if arguments.verbose == "yes":
                    prettyPrint("Removing all TF-IDF files of the current batch", "debug")
                rmCounter = 0
                for featureFile in glob.glob("%s/*.%s_t*" % (arguments.sourcedir, arguments.datatype)): # TODO: This will remove tfidf_both you stupid f**k!!
                    os.unlink(featureFile)
                    rmCounter += 1
                prettyPrint("Successfully removed %s files" % rmCounter)
                    
            classificationLog.close()
            # Now average the scored results stored in the matrices
            pointsX, pointsYacc, pointsYtime = [], [], []
            for value in allValues:
                pointsX.append(value)
                pointsYacc.append(averageList(allAccuracyRates[:,allValues.index(value)]))
                pointsYtime.append(averageList(allTimings[:,allValues.index(value)]))
             # Plot accuracies and timings graphs
            if arguments.algorithm == "tree":
                xAxisLabel = "Maximum Tree Depth"
            elif arguments.algorithm == "bayes":
                xAxisLabel = "Selected Features" if criterion == "select" else "Extracted Features"
           
            prettyPrint("Plotting accuracies for \"%s\" criterion" % criterion)
            data_visualization.plotAccuracyGraph(pointsX, pointsYacc, xAxisLabel, "Classification Accuracy", "Classification Accuracy: %s (%s)" % (criterion, arguments.datatype), "accuracy_%s_exp2_%s_%s.pdf" % (arguments.datatype, criterion, arguments.algorithm))
            #prettyPrint("Plotting timings")
            #data_visualization.plotAccuracyGraph(pointsX, pointsYtime, "Maximum Tree Depth", "Classification Timing (sec)", "Classification Timing: %s (%s)" % (criterion, arguments.datatype))

        ####################################
        # MODE X : Filter generated traces #
        ####################################

        # Done: filter the data
        elif arguments.mode == "filter-traces":
            # Retrieve the necessary parameters
            inExtension = raw_input("Input extension (Default: dyndis): ")
            outExtension = raw_input("Output extension (Default: dyndis_raw): ")
            filterMode = raw_input("Filteration mode {raw (Default), mem, both}: ")
            if filterTraces(arguments.sourcedir, inExtension, filterMode, outExtension, arguments.filterfunction):
                prettyPrint("Successfully filtered \"%s\" traces to \"%s\" traces using the \"%s\" filter" % (inExtension, outExtension, filterMode))
            else:
               prettyPrint("Some error occurred during filteration", "warning")

        ########################################################
        # MODE XI: Generate TF-IDF feature vectors from traces #
        ########################################################

        # Done: extract the specified features from the input files
        elif arguments.mode == "extract-from-traces":
            # Retrieve the necessary parameters
            inExtension = raw_input("Input extension (Default: dyndis): ")
            outExtension = raw_input("Output extension (Default: tfidf_raw): ")
            maxFeatures = int(raw_input("Maximum features: "))
            if feature_extraction.extractTFIDFMemoryFriendly(arguments.sourcedir, inExtension, maxFeatures, outExtension):
                prettyPrint("Successfully extracted %s TF-IDF features from traces with \"%s\" extension" % (maxFeatures, inExtension))
            else:
                prettyPrint("Some error occurred during TF-IDF feature extraction", "warning")

    except Exception as e:
        #global garbage
        prettyPrint("Error encountered in \"main\": %s at line %s" % (e, sys.exc_info()[2].tb_lineno), "error")
        #print garbage
        cleanUp()
        return