def loadLabelFromFile(fileName):
    """ Loads clusters from file into a string

    Two kinds of files are understood:
      - A label file (its path contains ".label"): the first line is the cluster.
      - A metadata file (anything else): a comma-separated list of "key=value"
        tokens wrapped in one leading and one trailing character. Values of
        "Transform" tokens are joined with "_" to make the cluster; remaining
        tokens (except the "Functions" and "out" ones) are returned as parameters.
        If the text "Ident" occurs anywhere in the file, the cluster is "Ident".

    :param fileName: Path to the label or metadata file
    :return: A tuple (cluster, params) where cluster is a string and params a dict
             of strings. ("", {}) is returned when the file does not exist.
    """
    cluster, params = "", {}
    if not os.path.exists(fileName):
        prettyPrint("Could not find \"%s\". Skipping" % fileName, "warning")
        return cluster, params
    # Read through a context manager so the handle is closed deterministically
    with open(fileName) as labelFile:
        rawData = labelFile.read()
    if fileName.find(".label") != -1:
        # It's a label file
        return rawData.split('\n')[0], params
    # It's a metadata file
    if rawData.find("Ident") != -1:
        return "Ident", params
    transformations = []
    for token in rawData[1:-1].split(","):
        parts = token.split("=")
        if len(parts) < 2:
            # No "key=value" pair to read from this token
            continue
        if token.find("Transform") != -1:
            transformations.append(parts[1].replace("'", ""))
        elif token.find("Functions") == -1 and token.find("out") == -1:
            key = parts[0].replace('-', '').replace("'", "")
            params[key] = parts[1].replace("'", "")
    # Joining never leaves a trailing underscore and, unlike indexing the last
    # character, also works when no transformation was found at all
    cluster = "_".join(transformations)
    return cluster, params
def loadArgumentsFromKLEE(fileName):
    """ Parses KLEE testcase, saves arguments to file, and returns a list of retrieved arguments

    A row mentioning "arg" (but neither "n_args" nor "--sym-args") announces an
    argument object; the first following row containing "data: " carries its
    value. Quotes are dropped from the value and its escape sequences are decoded.
    The arguments are then written, each followed by one space, to a sibling
    file whose name has ".txt" replaced by ".input".

    The file is handled as bytes throughout, so on Python 3 the returned
    arguments are bytes objects (on Python 2 they are plain str, as before).

    :param fileName: Path to the textual KLEE testcase
    :return: A tuple (args, inputPath); ([], "") if the testcase file is empty
    """
    import codecs  # Local import: only this function needs it

    with open(fileName, "rb") as testcase:
        content = testcase.read()
    # Splitting always yields at least one row, so emptiness has to be checked
    # on the content itself
    if not content.strip():
        prettyPrint("KLEE testcase file is empty", "warning")
        return [], ""
    args = []
    argFlag = False
    # Retrieve the arguments
    for row in content.split(b"\n"):
        if row.find(b"arg") != -1 and row.find(b"n_args") == -1 and row.find(b"--sym-args") == -1:
            argFlag = True
        elif argFlag and row.find(b"data: ") != -1:
            argIndex = row.find(b"data: ") + len(b"data: ")
            # escape_decode is the bytes-to-bytes decoder behind Python 2's
            # "string_escape" codec and is also available on Python 3
            args.append(codecs.escape_decode(row[argIndex:].replace(b"'", b""))[0])
            argFlag = False
    # Now write arguments to file
    inputPath = fileName.replace(".txt", ".input")
    if inputPath == fileName:
        # No ".txt" in the name: never overwrite the testcase itself
        inputPath = fileName + ".input"
    with open(inputPath, "wb") as inputFile:
        for arg in args:
            inputFile.write(arg)
            inputFile.write(b" ")
    return args, inputPath
def loadLabelFromFile(fileName):
    """ Loads clusters from file into a string

    Two kinds of files are understood:
      - A label file (its path contains ".label"): the first line is the cluster.
      - A metadata file (anything else): a comma-separated list of "key=value"
        tokens wrapped in one leading and one trailing character. Values of
        "Transform" tokens are joined with "_" to make the cluster; remaining
        tokens (except the "Functions" and "out" ones) are returned as parameters.
        If the text "Ident" occurs anywhere in the file, the cluster is "Ident".

    :param fileName: Path to the label or metadata file
    :return: A tuple (cluster, params) where cluster is a string and params a dict
             of strings. ("", {}) is returned when the file does not exist.
    """
    cluster, params = "", {}
    if not os.path.exists(fileName):
        prettyPrint("Could not find \"%s\". Skipping" % fileName, "warning")
        return cluster, params
    # Read through a context manager so the handle is closed deterministically
    with open(fileName) as labelFile:
        rawData = labelFile.read()
    if fileName.find(".label") != -1:
        # It's a label file
        return rawData.split('\n')[0], params
    # It's a metadata file
    if rawData.find("Ident") != -1:
        return "Ident", params
    transformations = []
    for token in rawData[1:-1].split(","):
        parts = token.split("=")
        if len(parts) < 2:
            # No "key=value" pair to read from this token
            continue
        if token.find("Transform") != -1:
            transformations.append(parts[1].replace("'", ""))
        elif token.find("Functions") == -1 and token.find("out") == -1:
            key = parts[0].replace('-', '').replace("'", "")
            params[key] = parts[1].replace("'", "")
    # Joining never leaves a trailing underscore and, unlike indexing the last
    # character, also works when no transformation was found at all
    cluster = "_".join(transformations)
    return cluster, params
def loadAlphaSequences(fileName, sequenceSize=0):
    """ Loads alpha sequences from a file into a list of characters

    Every character of the file except newlines becomes one list element.

    :param fileName: Path to the file holding the alpha sequence
    :param sequenceSize: Number of leading characters to keep; 0, or a value
                         larger than the sequence, keeps everything
    :return: A list of single-character strings; empty if the file is missing
    """
    alphaSequence = []
    if not os.path.exists(fileName):
        prettyPrint("File \"%s\" was not found" % fileName, "warning")
        # Nothing to read: stop here rather than failing on open() below
        return alphaSequence
    with open(fileName) as sequenceFile:
        rawSequence = sequenceFile.read()
    alphaSequence = [alpha for alpha in rawSequence if alpha != '\n']
    if sequenceSize == 0 or sequenceSize > len(alphaSequence):
        return alphaSequence
    return alphaSequence[:sequenceSize]
def sequenceToAlpha(behavior):
    """ Converts an instruction trace into an alphabet sequence

    Each distinct action is assigned the next unused entry of the module-level
    "availableLetters" list; the assignment is remembered in the module-level
    "sequenceAlphaMap" dict, so the same action maps to the same letter across
    calls. Both objects are mutated in place.

    :param behavior: Either an iterable of actions or one comma-separated string
    :return: The concatenated letters. If an error occurs (it is reported via
             prettyPrint), the part converted so far is returned.
    """
    alphaSequence = ""
    try:
        if isinstance(behavior, str):
            behavior = behavior.split(',')
        for action in behavior:
            # Test membership on the dict itself: a hash lookup rather than a
            # scan over a list of keys
            if action not in sequenceAlphaMap:
                sequenceAlphaMap[action] = availableLetters.pop(0)
            alphaSequence += sequenceAlphaMap[action]
    except Exception as e:
        # Best effort: report the problem and hand back what was converted
        prettyPrint("Error encountered while converting trace into alpha sequence: %s" % e, "error")
        prettyPrint("Length of current sequence is \"%s\"" % len(alphaSequence))
    return alphaSequence
def sequenceToAlpha(behavior):
    """ Converts an instruction trace into an alphabet sequence

    Each distinct action is assigned the next unused entry of the module-level
    "availableLetters" list; the assignment is remembered in the module-level
    "sequenceAlphaMap" dict, so the same action maps to the same letter across
    calls. Both objects are mutated in place.

    :param behavior: Either an iterable of actions or one comma-separated string
    :return: The concatenated letters. If an error occurs (it is reported via
             prettyPrint), the part converted so far is returned.
    """
    alphaSequence = ""
    try:
        if isinstance(behavior, str):
            behavior = behavior.split(',')
        for action in behavior:
            # Test membership on the dict itself: a hash lookup rather than a
            # scan over a list of keys
            if action not in sequenceAlphaMap:
                sequenceAlphaMap[action] = availableLetters.pop(0)
            alphaSequence += sequenceAlphaMap[action]
    except Exception as e:
        # Best effort: report the problem and hand back what was converted
        prettyPrint("Error encountered while converting trace into alpha sequence: %s" % e, "error")
        prettyPrint("Length of current sequence is \"%s\"" % len(alphaSequence))
    return alphaSequence
def loadArgumentsFromKLEE(fileName):
    """ Parses KLEE testcase, saves arguments to file, and returns a list of retrieved arguments

    A row mentioning "arg" (but neither "n_args" nor "--sym-args") announces an
    argument object; the first following row containing "data: " carries its
    value. Quotes are dropped from the value and its escape sequences are decoded.
    The arguments are then written, each followed by one space, to a sibling
    file whose name has ".txt" replaced by ".input".

    The file is handled as bytes throughout, so on Python 3 the returned
    arguments are bytes objects (on Python 2 they are plain str, as before).

    :param fileName: Path to the textual KLEE testcase
    :return: A tuple (args, inputPath); ([], "") if the testcase file is empty
    """
    import codecs  # Local import: only this function needs it

    with open(fileName, "rb") as testcase:
        content = testcase.read()
    # Splitting always yields at least one row, so emptiness has to be checked
    # on the content itself
    if not content.strip():
        prettyPrint("KLEE testcase file is empty", "warning")
        return [], ""
    args = []
    argFlag = False
    # Retrieve the arguments
    for row in content.split(b"\n"):
        if row.find(b"arg") != -1 and row.find(b"n_args") == -1 and row.find(b"--sym-args") == -1:
            argFlag = True
        elif argFlag and row.find(b"data: ") != -1:
            argIndex = row.find(b"data: ") + len(b"data: ")
            # escape_decode is the bytes-to-bytes decoder behind Python 2's
            # "string_escape" codec and is also available on Python 3
            args.append(codecs.escape_decode(row[argIndex:].replace(b"'", b""))[0])
            argFlag = False
    # Now write arguments to file
    inputPath = fileName.replace(".txt", ".input")
    if inputPath == fileName:
        # No ".txt" in the name: never overwrite the testcase itself
        inputPath = fileName + ".input"
    with open(inputPath, "wb") as inputFile:
        for arg in args:
            inputFile.write(arg)
            inputFile.write(b" ")
    return args, inputPath
def loadFeaturesFromList(dataFiles, dataType, labelExtension="metadata", classReference=[]):
    """ Loads features from a list of files

    For every entry of [dataFiles] the feature file with extension [dataType]
    is read and its class is resolved through loadLabelFromFile. The class
    name, extended with any parameters returned alongside it, is translated
    into its index within [classReference].

    :param dataFiles: Paths of the files to load; the extension of each path is
                      swapped for [dataType] before reading
    :param dataType: "freq", "util", "hmm" or anything containing "tfidf" (list of
                     floats), "triton" (mixed numerical/nominal values), or
                     "seq"/"parseq" (raw file content)
    :param labelExtension: "label" or "metadata"; selects where classes come from
    :param classReference: List of known class names, extended in place.
                           NOTE(review): the default list is shared between calls,
                           which keeps class indices consistent across calls that
                           omit it - pass an explicit list for a fresh mapping;
                           confirm whether callers rely on the sharing.
    :return: A tuple (dataPoints, dataLabels, classReference)
    :raises ValueError: If [labelExtension] is neither "label" nor "metadata"
    """
    # Retrieve all files
    if len(dataFiles) < 1:
        prettyPrint("No data files of type \"%s\" were found." % dataType, "warning")
        # Same arity as the regular return value so callers can always unpack three values
        return numpy.array([]), numpy.array([]), classReference
    if labelExtension not in ("label", "metadata"):
        # Fail with a clear message instead of an obscure NameError further down
        raise ValueError("Unknown label extension \"%s\"" % labelExtension)

    def testCasePrefix(path):
        # Part of the path preceding the last "_test"; without such a marker only the extension is dropped
        marker = path.rfind("_test")
        return path[:marker] if marker != -1 else os.path.splitext(path)[0]

    # Iterate over files adding their values to an array
    dataPoints, dataLabels = [], []
    for dataFile in dataFiles:
        # Resolve the label file afresh for every data file so that nothing
        # leaks over from the previous iteration
        if labelExtension == "label":
            # TODO: Accomodate for the KLEE files
            if dataFile.find("test") != -1:
                labelFile = testCasePrefix(dataFile) + ".label"
            else:
                labelFile = os.path.splitext(dataFile)[0] + ".label"
            if not os.path.exists(labelFile):
                prettyPrint("Could not find a label file for \"%s\". Skipping" % dataFile, "warning")
                continue
        else:
            labelFile = testCasePrefix(dataFile) + ".metadata"
        # Make sure we're loading from the right extension
        dataFile = os.path.splitext(dataFile)[0] + ".%s" % dataType
        # Store the feature values of the file in dataPoints
        if dataType.find("tfidf") != -1 or dataType in ("freq", "util", "hmm"):
            # Load features as numerical
            with open(dataFile) as featureFile:
                dataPoints.append([float(x) for x in featureFile.read()[1:-1].split(',')])
        elif dataType == "triton":
            # Load features as numerical/nominal
            with open(dataFile) as featureFile:
                content = featureFile.read().replace("\n", "").replace(" ", "")[1:-1]
            features = []
            for feature in content.split(","):
                feature = feature.replace("'", "")
                if feature.isdigit():
                    features.append(int(feature))
                elif feature.find(".") != -1:
                    features.append(float(feature))
                else:
                    # Numerizing "Yes" and "No"
                    features.append(1.0 if feature.lower() == "yes" else 0.0)
            # Append to dataPoints
            dataPoints.append(features)
        elif dataType in ("seq", "parseq"):
            # Load features as sequence of strings
            with open(dataFile) as featureFile:
                dataPoints.append(featureFile.read())
        else:
            # Without a data point a label would leave both lists misaligned
            prettyPrint("Unsupported data type \"%s\". Skipping \"%s\"" % (dataType, dataFile), "warning")
            continue
        # Also add the class label
        currentClass, currentParams = loadLabelFromFile(labelFile)
        for attribute in currentParams:
            currentClass += "_%s_%s" % (attribute, currentParams[attribute])
        currentClass = currentClass.replace(" ", "")  # Get rid of any spaces
        # Translate that to integers
        if currentClass not in classReference:
            classReference.append(currentClass)
        dataLabels.append(classReference.index(currentClass))  # Add an index as the class label
    # Now return the data points and labels as lists
    return dataPoints, dataLabels, classReference
def filterTraces(sourceDir, inExtension, filterMode, outExtension, targetFunction="main"):
    """ Filters the GDB generated traces according to the supplied [filterMode]

    Every "*.[inExtension]" file in [sourceDir] is reduced to the instructions
    belonging to [targetFunction] and written next to it with [outExtension].
    Extensions containing "objdump" are parsed as static disassembly listings
    (calls become "name()"); anything else is parsed as a dynamic trace, from
    which only the "=>" rows mentioning [targetFunction] are kept.

    :param sourceDir: Directory holding the traces
    :param inExtension: Extension of the input files (without the leading dot)
    :param filterMode: "both" replaces immediate values with "imm" and memory
                       addresses with "mem"; "mem" replaces memory addresses only
                       and leaves rows holding an immediate untouched; "raw"
                       leaves rows as they are (case-insensitive)
    :param outExtension: Extension of the generated files
    :param targetFunction: Name of the function whose instructions are kept
    :return: True on success; False if no input file was found or [filterMode]
             is unknown
    """
    # Compile once instead of on every instruction
    immReg = re.compile(r'\$0x\w+')
    memReg = re.compile(r'0x\w+')
    mode = filterMode.lower()
    # Retrieve list of files from input dir
    allfiles = glob.glob("%s/*.%s" % (sourceDir, inExtension))
    if len(allfiles) < 1:
        prettyPrint("Unable to retrieve \"*.%s\" from \"%s\"" % (inExtension, sourceDir), "warning")
        return False
    prettyPrint("Successfully retrieved %s \"*.%s\" from \"%s\"" % (len(allfiles), inExtension, sourceDir), "debug")
    if mode not in ("both", "mem", "raw"):
        # Really stop here, once, instead of warning for every single instruction
        prettyPrint("Unknown filter mode \"%s\". Exiting." % filterMode, "warning")
        return False
    isStatic = inExtension.find("objdump") != -1
    filecounter = 0
    # Loop on retrieved file and filter their content
    for inputfile in allfiles:
        prettyPrint("Processing file: %s, #%s out of %s" % (inputfile, filecounter + 1, len(allfiles)), "debug")
        with open(inputfile) as tracefile:
            alllines = tracefile.read().split('\n')
        rawlines = []
        if isStatic:
            # Static listing: keep only the instructions of [targetFunction]
            inMain = False
            for line in alllines:
                if line.find("<%s>" % targetFunction) != -1:
                    inMain = True
                elif line.find(">:") != -1:
                    # Header of another function
                    inMain = False
                if inMain and len(line.split('\t')) > 2:
                    if line.find("call") != -1:
                        functionName = line[line.rfind('<') + 1:line.rfind('>')]
                        rawlines.append("%s()" % functionName)
                    else:
                        rawlines.append(line.split('\t')[-1])
        else:
            # Dynamic trace: keep only the instructions of [targetFunction]
            # NOTE(review): rows outside the target function are dropped. The
            # previous code built a "name()" placeholder for call instructions
            # among them but never stored it - confirm whether those calls
            # were meant to be emitted.
            for line in alllines:
                if line.find("=>") != -1 and line.find(targetFunction) != -1:
                    rawlines.append(line[line.find(':') + 1:])
        # Swap the extension at the end of the path only, so a directory that
        # happens to contain [inExtension] in its name stays untouched
        if inputfile.endswith(inExtension):
            outputpath = inputfile[:len(inputfile) - len(inExtension)] + outExtension
        else:
            outputpath = inputfile.replace(inExtension, outExtension)
        # Now filter them and write the result; the handle is closed per file
        with open(outputpath, "w") as outputfile:
            for templine in rawlines:
                # Match and replace immediate and memory values
                if mode == "both":
                    # Get rid of the immediate first (the more specific)
                    templine = immReg.sub("imm", templine)
                    templine = memReg.sub("mem", templine)
                elif mode == "mem":
                    # Rows holding an immediate are skipped, others get their memory location filtered
                    if not immReg.search(templine):
                        templine = memReg.sub("mem", templine)
                # "raw" leaves both the memory and immediate values alone
                # Remove commas and write the instruction to file
                outputfile.write("%s\n" % templine.replace(',', ' '))
        filecounter += 1
    prettyPrint("Successfully processed %s \"*.%s\"." % (filecounter, inExtension), "debug")
    return True
def loadFeaturesFromList(dataFiles, dataType, labelExtension="metadata", classReference=[]):
    """ Loads features from a list of files

    For every entry of [dataFiles] the feature file with extension [dataType]
    is read and its class is resolved through loadLabelFromFile. The class
    name, extended with any parameters returned alongside it, is translated
    into its index within [classReference].

    :param dataFiles: Paths of the files to load; the extension of each path is
                      swapped for [dataType] before reading
    :param dataType: "freq", "util", "hmm" or anything containing "tfidf" (list of
                     floats), "triton" (mixed numerical/nominal values), or
                     "seq"/"parseq" (raw file content)
    :param labelExtension: "label" or "metadata"; selects where classes come from
    :param classReference: List of known class names, extended in place.
                           NOTE(review): the default list is shared between calls,
                           which keeps class indices consistent across calls that
                           omit it - pass an explicit list for a fresh mapping;
                           confirm whether callers rely on the sharing.
    :return: A tuple (dataPoints, dataLabels, classReference)
    :raises ValueError: If [labelExtension] is neither "label" nor "metadata"
    """
    # Retrieve all files
    if len(dataFiles) < 1:
        prettyPrint("No data files of type \"%s\" were found." % dataType, "warning")
        # Same arity as the regular return value so callers can always unpack three values
        return numpy.array([]), numpy.array([]), classReference
    if labelExtension not in ("label", "metadata"):
        # Fail with a clear message instead of an obscure NameError further down
        raise ValueError("Unknown label extension \"%s\"" % labelExtension)

    def testCasePrefix(path):
        # Part of the path preceding the last "_test"; without such a marker only the extension is dropped
        marker = path.rfind("_test")
        return path[:marker] if marker != -1 else os.path.splitext(path)[0]

    # Iterate over files adding their values to an array
    dataPoints, dataLabels = [], []
    for dataFile in dataFiles:
        # Resolve the label file afresh for every data file so that nothing
        # leaks over from the previous iteration
        if labelExtension == "label":
            # TODO: Accomodate for the KLEE files
            if dataFile.find("test") != -1:
                labelFile = testCasePrefix(dataFile) + ".label"
            else:
                labelFile = os.path.splitext(dataFile)[0] + ".label"
            if not os.path.exists(labelFile):
                prettyPrint("Could not find a label file for \"%s\". Skipping" % dataFile, "warning")
                continue
        else:
            labelFile = testCasePrefix(dataFile) + ".metadata"
        # Make sure we're loading from the right extension
        dataFile = os.path.splitext(dataFile)[0] + ".%s" % dataType
        # Store the feature values of the file in dataPoints
        if dataType.find("tfidf") != -1 or dataType in ("freq", "util", "hmm"):
            # Load features as numerical
            with open(dataFile) as featureFile:
                dataPoints.append([float(x) for x in featureFile.read()[1:-1].split(',')])
        elif dataType == "triton":
            # Load features as numerical/nominal
            with open(dataFile) as featureFile:
                content = featureFile.read().replace("\n", "").replace(" ", "")[1:-1]
            features = []
            for feature in content.split(","):
                feature = feature.replace("'", "")
                if feature.isdigit():
                    features.append(int(feature))
                elif feature.find(".") != -1:
                    features.append(float(feature))
                else:
                    # Numerizing "Yes" and "No"
                    features.append(1.0 if feature.lower() == "yes" else 0.0)
            # Append to dataPoints
            dataPoints.append(features)
        elif dataType in ("seq", "parseq"):
            # Load features as sequence of strings
            with open(dataFile) as featureFile:
                dataPoints.append(featureFile.read())
        else:
            # Without a data point a label would leave both lists misaligned
            prettyPrint("Unsupported data type \"%s\". Skipping \"%s\"" % (dataType, dataFile), "warning")
            continue
        # Also add the class label
        currentClass, currentParams = loadLabelFromFile(labelFile)
        for attribute in currentParams:
            currentClass += "_%s_%s" % (attribute, currentParams[attribute])
        currentClass = currentClass.replace(" ", "")  # Get rid of any spaces
        # Translate that to integers
        if currentClass not in classReference:
            classReference.append(currentClass)
        dataLabels.append(classReference.index(currentClass))  # Add an index as the class label
    # Now return the data points and labels as lists
    return dataPoints, dataLabels, classReference
def filterTraces(sourceDir, inExtension, filterMode, outExtension, targetFunction="main"):
    """ Filters the GDB generated traces according to the supplied [filterMode]

    Every "*.[inExtension]" file in [sourceDir] is reduced to the instructions
    belonging to [targetFunction] and written next to it with [outExtension].
    Extensions containing "objdump" are parsed as static disassembly listings
    (calls become "name()"); anything else is parsed as a dynamic trace, from
    which only the "=>" rows mentioning [targetFunction] are kept.

    :param sourceDir: Directory holding the traces
    :param inExtension: Extension of the input files (without the leading dot)
    :param filterMode: "both" replaces immediate values with "imm" and memory
                       addresses with "mem"; "mem" replaces memory addresses only
                       and leaves rows holding an immediate untouched; "raw"
                       leaves rows as they are (case-insensitive)
    :param outExtension: Extension of the generated files
    :param targetFunction: Name of the function whose instructions are kept
    :return: True on success; False if no input file was found or [filterMode]
             is unknown
    """
    # Compile once instead of on every instruction
    immReg = re.compile(r'\$0x\w+')
    memReg = re.compile(r'0x\w+')
    mode = filterMode.lower()
    # Retrieve list of files from input dir
    allfiles = glob.glob("%s/*.%s" % (sourceDir, inExtension))
    if len(allfiles) < 1:
        prettyPrint("Unable to retrieve \"*.%s\" from \"%s\"" % (inExtension, sourceDir), "warning")
        return False
    prettyPrint("Successfully retrieved %s \"*.%s\" from \"%s\"" % (len(allfiles), inExtension, sourceDir), "debug")
    if mode not in ("both", "mem", "raw"):
        # Really stop here, once, instead of warning for every single instruction
        prettyPrint("Unknown filter mode \"%s\". Exiting." % filterMode, "warning")
        return False
    isStatic = inExtension.find("objdump") != -1
    filecounter = 0
    # Loop on retrieved file and filter their content
    for inputfile in allfiles:
        prettyPrint("Processing file: %s, #%s out of %s" % (inputfile, filecounter + 1, len(allfiles)), "debug")
        with open(inputfile) as tracefile:
            alllines = tracefile.read().split('\n')
        rawlines = []
        if isStatic:
            # Static listing: keep only the instructions of [targetFunction]
            inMain = False
            for line in alllines:
                if line.find("<%s>" % targetFunction) != -1:
                    inMain = True
                elif line.find(">:") != -1:
                    # Header of another function
                    inMain = False
                if inMain and len(line.split('\t')) > 2:
                    if line.find("call") != -1:
                        functionName = line[line.rfind('<') + 1:line.rfind('>')]
                        rawlines.append("%s()" % functionName)
                    else:
                        rawlines.append(line.split('\t')[-1])
        else:
            # Dynamic trace: keep only the instructions of [targetFunction]
            # NOTE(review): rows outside the target function are dropped. The
            # previous code built a "name()" placeholder for call instructions
            # among them but never stored it - confirm whether those calls
            # were meant to be emitted.
            for line in alllines:
                if line.find("=>") != -1 and line.find(targetFunction) != -1:
                    rawlines.append(line[line.find(':') + 1:])
        # Swap the extension at the end of the path only, so a directory that
        # happens to contain [inExtension] in its name stays untouched
        if inputfile.endswith(inExtension):
            outputpath = inputfile[:len(inputfile) - len(inExtension)] + outExtension
        else:
            outputpath = inputfile.replace(inExtension, outExtension)
        # Now filter them and write the result; the handle is closed per file
        with open(outputpath, "w") as outputfile:
            for templine in rawlines:
                # Match and replace immediate and memory values
                if mode == "both":
                    # Get rid of the immediate first (the more specific)
                    templine = immReg.sub("imm", templine)
                    templine = memReg.sub("mem", templine)
                elif mode == "mem":
                    # Rows holding an immediate are skipped, others get their memory location filtered
                    if not immReg.search(templine):
                        templine = memReg.sub("mem", templine)
                # "raw" leaves both the memory and immediate values alone
                # Remove commas and write the instruction to file
                outputfile.write("%s\n" % templine.replace(',', ' '))
        filecounter += 1
    prettyPrint("Successfully processed %s \"*.%s\"." % (filecounter, inExtension), "debug")
    return True